unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,34 @@
1
+ import hashlib
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.interfaces import BaseSourceConnector
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.runner.base_runner import Runner
8
+ from unstructured_ingest.runner.utils import update_download_dir_hash
9
+
10
+ if t.TYPE_CHECKING:
11
+ from unstructured_ingest.connector.mongodb import SimpleMongoDBConfig
12
+
13
+
14
@dataclass
class MongoDBRunner(Runner):
    """Runner wiring the MongoDB source connector into an ingest run."""

    connector_config: "SimpleMongoDBConfig"

    def update_read_config(self):
        """Assign ``read_config.download_dir`` for this MongoDB source.

        The SHA-256 of the stringified connection URI is handed to
        ``update_download_dir_hash`` together with the connector name.
        """
        uri_text = str(self.connector_config.access_config.uri)
        uri_digest = hashlib.sha256(uri_text.encode("utf-8"))
        self.read_config.download_dir = update_download_dir_hash(
            connector_name="mongodb",
            read_config=self.read_config,
            hashed_dir_name=uri_digest,
            logger=logger,
        )

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the MongoDB source connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.mongodb import MongoDBSourceConnector

        return MongoDBSourceConnector
@@ -0,0 +1,61 @@
1
+ import hashlib
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.interfaces import BaseSourceConnector
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.runner.base_runner import Runner
8
+ from unstructured_ingest.runner.utils import update_download_dir_hash
9
+
10
+ if t.TYPE_CHECKING:
11
+ from unstructured_ingest.connector.notion.connector import SimpleNotionConfig
12
+
13
+
14
@dataclass
class NotionRunner(Runner):
    """Runner wiring the Notion source connector into an ingest run."""

    connector_config: "SimpleNotionConfig"

    def update_read_config(self):
        """Assign ``read_config.download_dir`` from the configured Notion ids.

        The hashed text is the comma-joined page ids followed by the
        comma-joined database ids (the two groups themselves separated by a
        comma); a group that is empty or missing is left out.

        Raises:
            ValueError: if neither page ids nor database ids are configured.
        """
        page_ids = self.connector_config.page_ids
        database_ids = self.connector_config.database_ids
        if not page_ids and not database_ids:
            raise ValueError("no page ids nor database ids provided")

        # After the guard at least one group is non-empty, so no fallback
        # branch is needed. Filtering on the id collection (not on the joined
        # text) keeps the hashed string identical to joining each case by hand.
        id_groups = [",".join(ids) for ids in (page_ids, database_ids) if ids]
        hashed_dir_name = hashlib.sha256(",".join(id_groups).encode("utf-8"))

        self.read_config.download_dir = update_download_dir_hash(
            connector_name="notion",
            read_config=self.read_config,
            hashed_dir_name=hashed_dir_name,
            logger=logger,
        )

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the Notion source connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.notion.connector import (
            NotionSourceConnector,
        )

        return NotionSourceConnector

    def get_source_connector(self) -> BaseSourceConnector:
        """Instantiate the Notion source connector from this runner's configs.

        Passes the processor, connector, read and retry-strategy configs held
        on the runner to the connector constructor.
        """
        source_connector_cls = self.get_source_connector_cls()
        return source_connector_cls(
            processor_config=self.processor_config,
            connector_config=self.connector_config,
            read_config=self.read_config,
            retry_strategy_config=self.retry_strategy_config,
        )
@@ -0,0 +1,35 @@
1
+ import hashlib
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.interfaces import BaseSourceConnector
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.runner.base_runner import Runner
8
+ from unstructured_ingest.runner.utils import update_download_dir_hash
9
+
10
+ if t.TYPE_CHECKING:
11
+ from unstructured_ingest.connector.onedrive import SimpleOneDriveConfig
12
+
13
+
14
@dataclass
class OneDriveRunner(Runner):
    """Runner wiring the OneDrive source connector into an ingest run."""

    connector_config: "SimpleOneDriveConfig"

    def update_read_config(self):
        """Assign ``read_config.download_dir`` for this OneDrive source.

        The hashed text is the configured tenant and user principal name
        joined by an underscore.
        """
        cache_key = f"{self.connector_config.tenant}_{self.connector_config.user_pname}"
        key_digest = hashlib.sha256(cache_key.encode("utf-8"))
        self.read_config.download_dir = update_download_dir_hash(
            connector_name="onedrive",
            read_config=self.read_config,
            hashed_dir_name=key_digest,
            logger=logger,
        )

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the OneDrive source connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.onedrive import OneDriveSourceConnector

        return OneDriveSourceConnector
@@ -0,0 +1,40 @@
1
+ import hashlib
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.interfaces import BaseSourceConnector
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.runner.base_runner import Runner
8
+ from unstructured_ingest.runner.utils import update_download_dir_hash
9
+
10
+ if t.TYPE_CHECKING:
11
+ from unstructured_ingest.connector.opensearch import SimpleOpenSearchConfig
12
+
13
+
14
@dataclass
class OpenSearchRunner(Runner):
    """Runner wiring the OpenSearch source connector into an ingest run."""

    connector_config: "SimpleOpenSearchConfig"

    def update_read_config(self):
        """Assign ``read_config.download_dir`` for this OpenSearch source.

        The hashed text is the comma-joined hosts, an underscore, then the
        index name.
        """
        joined_hosts = ",".join(self.connector_config.access_config.hosts)
        cache_key = "{}_{}".format(joined_hosts, self.connector_config.index_name)
        key_digest = hashlib.sha256(cache_key.encode("utf-8"))
        self.read_config.download_dir = update_download_dir_hash(
            connector_name="opensearch",
            read_config=self.read_config,
            hashed_dir_name=key_digest,
            logger=logger,
        )

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the OpenSearch source connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.opensearch import OpenSearchSourceConnector

        return OpenSearchSourceConnector
@@ -0,0 +1,33 @@
1
+ import hashlib
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.interfaces import BaseSourceConnector
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.runner.base_runner import Runner
8
+ from unstructured_ingest.runner.utils import update_download_dir_hash
9
+
10
+ if t.TYPE_CHECKING:
11
+ from unstructured_ingest.connector.outlook import SimpleOutlookConfig
12
+
13
+
14
@dataclass
class OutlookRunner(Runner):
    """Runner wiring the Outlook source connector into an ingest run."""

    connector_config: "SimpleOutlookConfig"

    def update_read_config(self):
        """Assign ``read_config.download_dir`` from the configured user email."""
        email_bytes = self.connector_config.user_email.encode("utf-8")
        email_digest = hashlib.sha256(email_bytes)
        self.read_config.download_dir = update_download_dir_hash(
            connector_name="outlook",
            read_config=self.read_config,
            hashed_dir_name=email_digest,
            logger=logger,
        )

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the Outlook source connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.outlook import OutlookSourceConnector

        return OutlookSourceConnector
@@ -0,0 +1,35 @@
1
+ import hashlib
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.interfaces import BaseSourceConnector
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.runner.base_runner import Runner
8
+ from unstructured_ingest.runner.utils import update_download_dir_hash
9
+
10
+ if t.TYPE_CHECKING:
11
+ from unstructured_ingest.connector.reddit import SimpleRedditConfig
12
+
13
+
14
@dataclass
class RedditRunner(Runner):
    """Runner wiring the Reddit source connector into an ingest run."""

    connector_config: "SimpleRedditConfig"

    def update_read_config(self):
        """Assign ``read_config.download_dir`` from the configured subreddit name."""
        subreddit_bytes = self.connector_config.subreddit_name.encode("utf-8")
        subreddit_digest = hashlib.sha256(subreddit_bytes)
        self.read_config.download_dir = update_download_dir_hash(
            connector_name="reddit",
            read_config=self.read_config,
            hashed_dir_name=subreddit_digest,
            logger=logger,
        )

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the Reddit source connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.reddit import RedditSourceConnector

        return RedditSourceConnector
@@ -0,0 +1,33 @@
1
+ import hashlib
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.interfaces import BaseSourceConnector
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.runner.base_runner import Runner
8
+ from unstructured_ingest.runner.utils import update_download_dir_hash
9
+
10
+ if t.TYPE_CHECKING:
11
+ from unstructured_ingest.connector.salesforce import SimpleSalesforceConfig
12
+
13
+
14
@dataclass
class SalesforceRunner(Runner):
    """Runner wiring the Salesforce source connector into an ingest run."""

    connector_config: "SimpleSalesforceConfig"

    def update_read_config(self):
        """Assign ``read_config.download_dir`` from the configured username."""
        username_bytes = self.connector_config.username.encode("utf-8")
        username_digest = hashlib.sha256(username_bytes)
        self.read_config.download_dir = update_download_dir_hash(
            connector_name="salesforce",
            read_config=self.read_config,
            hashed_dir_name=username_digest,
            logger=logger,
        )

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the Salesforce source connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.salesforce import SalesforceSourceConnector

        return SalesforceSourceConnector
@@ -0,0 +1,35 @@
1
+ import hashlib
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.interfaces import BaseSourceConnector
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.runner.base_runner import Runner
8
+ from unstructured_ingest.runner.utils import update_download_dir_hash
9
+
10
+ if t.TYPE_CHECKING:
11
+ from unstructured_ingest.connector.sharepoint import SimpleSharepointConfig
12
+
13
+
14
@dataclass
class SharePointRunner(Runner):
    """Runner wiring the SharePoint source connector into an ingest run."""

    connector_config: "SimpleSharepointConfig"

    def update_read_config(self):
        """Assign ``read_config.download_dir`` for this SharePoint source.

        The hashed text is the configured site and path joined by an
        underscore.
        """
        cache_key = f"{self.connector_config.site}_{self.connector_config.path}"
        key_digest = hashlib.sha256(cache_key.encode("utf-8"))
        self.read_config.download_dir = update_download_dir_hash(
            connector_name="sharepoint",
            read_config=self.read_config,
            hashed_dir_name=key_digest,
            logger=logger,
        )

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the SharePoint source connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.sharepoint import SharepointSourceConnector

        return SharepointSourceConnector
@@ -0,0 +1,33 @@
1
+ import hashlib
2
+ import typing as t
3
+
4
+ from unstructured_ingest.interfaces import BaseSourceConnector
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.runner.base_runner import Runner
7
+ from unstructured_ingest.runner.utils import update_download_dir_hash
8
+
9
+ if t.TYPE_CHECKING:
10
+ from unstructured_ingest.connector.slack import SimpleSlackConfig
11
+
12
+
13
# NOTE(review): this class is declared without a @dataclass decorator and its
# module does not import `dataclass` — confirm whether that is intentional.
class SlackRunner(Runner):
    """Runner wiring the Slack source connector into an ingest run."""

    connector_config: "SimpleSlackConfig"

    def update_read_config(self):
        """Assign ``read_config.download_dir`` from the comma-joined channels."""
        joined_channels = ",".join(self.connector_config.channels)
        channels_digest = hashlib.sha256(joined_channels.encode("utf-8"))
        self.read_config.download_dir = update_download_dir_hash(
            connector_name="slack",
            read_config=self.read_config,
            hashed_dir_name=channels_digest,
            logger=logger,
        )

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the Slack source connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.slack import SlackSourceConnector

        return SlackSourceConnector
@@ -0,0 +1,47 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ from unstructured_ingest.interfaces import (
8
+ ReadConfig,
9
+ )
10
+
11
+
12
def update_download_dir_remote_url(
    connector_name: str,
    read_config: "ReadConfig",
    remote_url: str,
    logger: logging.Logger,
) -> str:
    """Resolve the download directory for a connector keyed by a remote URL.

    Hashes the UTF-8 encoded ``remote_url`` with SHA-256 and delegates to
    ``update_download_dir_hash``, returning whatever that function returns.
    """
    url_digest = hashlib.sha256(remote_url.encode("utf-8"))
    resolved_dir = update_download_dir_hash(
        connector_name=connector_name,
        read_config=read_config,
        hashed_dir_name=url_digest,
        logger=logger,
    )
    return resolved_dir
25
+
26
+
27
def update_download_dir_hash(
    connector_name: str,
    read_config: "ReadConfig",
    hashed_dir_name: "hashlib._Hash",
    logger: logging.Logger,
) -> str:
    """Return the download directory to use for a connector run.

    Args:
        connector_name: Sub-directory name used under the ingest cache root.
        read_config: Object exposing ``download_dir`` and
            ``preserve_downloads``.
        hashed_dir_name: Hash object; the first 10 characters of its
            ``hexdigest()`` name the leaf directory.
        logger: Logger receiving the warning/debug messages.

    Returns:
        ``read_config.download_dir`` unchanged when it is set; otherwise the
        string form of
        ``~/.cache/unstructured/ingest/<connector_name>/<digest[:10]>``.

    The ingest cache root is created when a default is computed; the
    per-connector directory itself is not created here.
    """
    # Annotations above are quoted so they are never evaluated at definition time.
    if read_config.download_dir:
        return read_config.download_dir

    cache_path = Path.home() / ".cache" / "unstructured" / "ingest"
    # exist_ok=True already tolerates an existing directory, so a separate
    # exists() check would only add a window for a race.
    cache_path.mkdir(parents=True, exist_ok=True)
    download_dir = cache_path / connector_name / hashed_dir_name.hexdigest()[:10]
    if read_config.preserve_downloads:
        logger.warning(
            "Preserving downloaded files but download_dir is not specified,"
            f" using {download_dir}",
        )
    new_download_dir = str(download_dir)
    logger.debug(f"updating download directory to: {new_download_dir}")
    return new_download_dir
@@ -0,0 +1,35 @@
1
+ import hashlib
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.interfaces import BaseSourceConnector
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.runner.base_runner import Runner
8
+ from unstructured_ingest.runner.utils import update_download_dir_hash
9
+
10
+ if t.TYPE_CHECKING:
11
+ from unstructured_ingest.connector.wikipedia import SimpleWikipediaConfig
12
+
13
+
14
@dataclass
class WikipediaRunner(Runner):
    """Runner wiring the Wikipedia source connector into an ingest run."""

    connector_config: "SimpleWikipediaConfig"

    def update_read_config(self):
        """Assign ``read_config.download_dir`` from the configured page title."""
        title_bytes = self.connector_config.page_title.encode("utf-8")
        title_digest = hashlib.sha256(title_bytes)
        self.read_config.download_dir = update_download_dir_hash(
            connector_name="wikipedia",
            read_config=self.read_config,
            hashed_dir_name=title_digest,
            logger=logger,
        )

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the Wikipedia source connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.wikipedia import WikipediaSourceConnector

        return WikipediaSourceConnector
@@ -0,0 +1,48 @@
1
+ import typing as t
2
+
3
+ from .astradb import AstraDBWriter
4
+ from .azure_ai_search import AzureAiSearchWriter
5
+ from .base_writer import Writer
6
+ from .chroma import ChromaWriter
7
+ from .clarifai import ClarifaiWriter
8
+ from .databricks_volumes import DatabricksVolumesWriter
9
+ from .delta_table import DeltaTableWriter
10
+ from .elasticsearch import ElasticsearchWriter
11
+ from .fsspec.azure import AzureWriter
12
+ from .fsspec.box import BoxWriter
13
+ from .fsspec.dropbox import DropboxWriter
14
+ from .fsspec.gcs import GcsWriter
15
+ from .fsspec.s3 import S3Writer
16
+ from .kafka import KafkaWriter
17
+ from .mongodb import MongodbWriter
18
+ from .opensearch import OpenSearchWriter
19
+ from .pinecone import PineconeWriter
20
+ from .qdrant import QdrantWriter
21
+ from .sql import SqlWriter
22
+ from .vectara import VectaraWriter
23
+ from .weaviate import WeaviateWriter
24
+
25
# Lookup table from a destination name to the Writer subclass handling it.
# Entries are kept in alphabetical key order.
writer_map: t.Dict[str, t.Type[Writer]] = {
    "astradb": AstraDBWriter,
    "azure": AzureWriter,
    "azure_ai_search": AzureAiSearchWriter,
    "box": BoxWriter,
    "chroma": ChromaWriter,
    "clarifai": ClarifaiWriter,
    "databricks_volumes": DatabricksVolumesWriter,
    "delta_table": DeltaTableWriter,
    "dropbox": DropboxWriter,
    "elasticsearch": ElasticsearchWriter,
    "gcs": GcsWriter,
    "kafka": KafkaWriter,
    "mongodb": MongodbWriter,
    "opensearch": OpenSearchWriter,
    "pinecone": PineconeWriter,
    "qdrant": QdrantWriter,
    "s3": S3Writer,
    "sql": SqlWriter,
    "vectara": VectaraWriter,
    "weaviate": WeaviateWriter,
}

# Only the lookup table is exported from this package.
__all__ = ["writer_map"]
@@ -0,0 +1,22 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
5
+ from unstructured_ingest.interfaces import BaseDestinationConnector
6
+ from unstructured_ingest.runner.writers.base_writer import Writer
7
+
8
+ if t.TYPE_CHECKING:
9
+ from unstructured_ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig
10
+
11
+
12
@dataclass
class AstraDBWriter(Writer, EnhancedDataClassJsonMixin):
    """Writer pairing AstraDB write/connector configs with its destination connector."""

    write_config: "AstraDBWriteConfig"
    connector_config: "SimpleAstraDBConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the AstraDB destination connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.astradb import AstraDBDestinationConnector

        return AstraDBDestinationConnector
@@ -0,0 +1,24 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.azure_ai_search import (
9
+ AzureAISearchWriteConfig,
10
+ SimpleAzureAISearchStorageConfig,
11
+ )
12
+
13
+
14
@dataclass
class AzureAiSearchWriter(Writer):
    """Writer pairing Azure AI Search configs with its destination connector."""

    connector_config: "SimpleAzureAISearchStorageConfig"
    write_config: "AzureAISearchWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Azure AI Search destination connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.azure_ai_search import (
            AzureAISearchDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,26 @@
1
+ import typing as t
2
+ from abc import ABC, abstractmethod
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.interfaces import (
6
+ BaseConnectorConfig,
7
+ BaseDestinationConnector,
8
+ WriteConfig,
9
+ )
10
+
11
+
12
@dataclass
class Writer(ABC):
    """Abstract base pairing a connector config and a write config.

    Subclasses name the destination connector class; ``get_connector``
    instantiates it from the two configs held here.
    """

    connector_config: BaseConnectorConfig
    write_config: WriteConfig

    @abstractmethod
    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class for this writer."""

    def get_connector(self, **kwargs) -> BaseDestinationConnector:
        """Build the destination connector from this writer's configs.

        NOTE(review): ``kwargs`` is accepted but not forwarded to the
        connector here — confirm whether overrides are expected to use it.
        """
        return self.get_connector_cls()(
            write_config=self.write_config,
            connector_config=self.connector_config,
        )
@@ -0,0 +1,22 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
5
+ from unstructured_ingest.interfaces import BaseDestinationConnector
6
+ from unstructured_ingest.runner.writers.base_writer import Writer
7
+
8
+ if t.TYPE_CHECKING:
9
+ from unstructured_ingest.connector.chroma import ChromaWriteConfig, SimpleChromaConfig
10
+
11
+
12
@dataclass
class ChromaWriter(Writer, EnhancedDataClassJsonMixin):
    """Writer pairing Chroma write/connector configs with its destination connector."""

    write_config: "ChromaWriteConfig"
    connector_config: "SimpleChromaConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Chroma destination connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.chroma import ChromaDestinationConnector

        return ChromaDestinationConnector
@@ -0,0 +1,19 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.clarifai import ClarifaiWriteConfig, SimpleClarifaiConfig
9
+
10
+
11
@dataclass
class ClarifaiWriter(Writer):
    """Writer pairing Clarifai write/connector configs with its destination connector."""

    write_config: "ClarifaiWriteConfig"
    connector_config: "SimpleClarifaiConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Clarifai destination connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.clarifai import (
            ClarifaiDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,25 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
5
+ from unstructured_ingest.interfaces import BaseDestinationConnector
6
+ from unstructured_ingest.runner.writers.base_writer import Writer
7
+
8
+ if t.TYPE_CHECKING:
9
+ from unstructured_ingest.connector.databricks_volumes import (
10
+ DatabricksVolumesWriteConfig,
11
+ SimpleDatabricksVolumesConfig,
12
+ )
13
+
14
+
15
@dataclass
class DatabricksVolumesWriter(Writer, EnhancedDataClassJsonMixin):
    """Writer pairing Databricks Volumes configs with its destination connector."""

    write_config: "DatabricksVolumesWriteConfig"
    connector_config: "SimpleDatabricksVolumesConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Databricks Volumes destination connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.databricks_volumes import (
            DatabricksVolumesDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,24 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.delta_table import (
9
+ DeltaTableWriteConfig,
10
+ SimpleDeltaTableConfig,
11
+ )
12
+
13
+
14
@dataclass
class DeltaTableWriter(Writer):
    """Writer pairing Delta Table write/connector configs with its destination connector."""

    write_config: "DeltaTableWriteConfig"
    connector_config: "SimpleDeltaTableConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Delta Table destination connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.delta_table import (
            DeltaTableDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,24 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.elasticsearch import (
9
+ ElasticsearchWriteConfig,
10
+ SimpleElasticsearchConfig,
11
+ )
12
+
13
+
14
@dataclass
class ElasticsearchWriter(Writer):
    """Writer pairing Elasticsearch configs with its destination connector."""

    connector_config: "SimpleElasticsearchConfig"
    write_config: "ElasticsearchWriteConfig"

    # The method returns the connector *class*, not an instance, so the return
    # annotation is t.Type[...] to match what the body actually returns.
    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Elasticsearch destination connector class."""
        # Imported at call time rather than at module import.
        from unstructured_ingest.connector.elasticsearch import (
            ElasticsearchDestinationConnector,
        )

        return ElasticsearchDestinationConnector
File without changes