unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,134 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import os.path
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING, Optional
9
+
10
+ from unstructured_ingest.interfaces import ChunkingConfig, PartitionConfig
11
+ from unstructured_ingest.logger import logger
12
+ from unstructured_ingest.pipeline.interfaces import ReformatNode
13
+ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
14
+
15
+ if TYPE_CHECKING:
16
+ from unstructured.documents.elements import Element
17
+
18
+
19
@dataclass
class Chunker(ReformatNode):
    """Implementation for the chunking node in the ingest pipeline.

    Reads a JSON file of elements, chunks them according to ``chunking_config``
    and writes the chunked elements to a JSON file inside the directory returned
    by ``get_path()``.

    Parameters
    ----------
    pipeline_context: PipelineContext (inherited from parent class)
    chunking_config: ChunkingConfig
    partition_config: PartitionConfig
    """

    chunking_config: ChunkingConfig
    partition_config: PartitionConfig

    def initialize(self):
        """Log the chunking configuration, then run the parent initialization."""
        logger.info(
            f"Running chunking node. Chunking config: {self.chunking_config.to_json()}",
        )
        super().initialize()

    def create_hash(self) -> str:
        """Return a 32-character hex digest identifying the chunking configuration."""
        hash_dict = self.chunking_config.to_dict()
        return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

    def run(self, elements_json: str) -> Optional[str]:
        """Chunk the elements stored in ``elements_json``.

        Returns the path of the JSON file holding the chunked elements, or
        ``None`` when no chunking was performed or when an error was logged
        instead of raised (``pipeline_context.raise_on_error`` is falsy).
        """
        try:
            filename_ext = os.path.basename(elements_json)
            filename = os.path.splitext(filename_ext)[0]
            # -- output name depends on both the chunking config and the input name --
            hashed_filename = hashlib.sha256(
                f"{self.create_hash()}{filename}".encode(),
            ).hexdigest()[:32]
            json_filename = f"{hashed_filename}.json"
            json_path = (Path(self.get_path()) / json_filename).resolve()
            # -- register the source doc under this node's output name as well --
            self.pipeline_context.ingest_docs_map[hashed_filename] = (
                self.pipeline_context.ingest_docs_map[filename]
            )
            if (
                not self.pipeline_context.reprocess
                and json_path.is_file()
                and json_path.stat().st_size
            ):
                logger.debug(f"file exists: {json_path}, skipping chunking")
                return str(json_path)

            chunked_elements = self.chunk(elements_json)

            # -- return if chunking_strategy is None --
            if chunked_elements is None:
                logger.info(f"chunking_strategy is None, skipping chunking for {filename_ext}")
                return None

            element_dicts = [e.to_dict() for e in chunked_elements]
            assign_and_map_hash_ids(elements=element_dicts)

            with open(json_path, "w", encoding="utf8") as output_f:
                logger.info(f"writing chunking content to {json_path}")
                json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
            return str(json_path)

        except Exception as e:
            if self.pipeline_context.raise_on_error:
                raise
            logger.error(f"failed to run chunking on file {elements_json}, {e}", exc_info=True)
            return None

    def get_path(self) -> Path:
        """Return the resolved ``chunked`` directory inside the pipeline work dir."""
        return (Path(self.pipeline_context.work_dir) / "chunked").resolve()

    def chunk(self, elements_json_file: str) -> Optional[list["Element"]]:
        """Called by Chunker.run() to properly execute the defined chunking_strategy.

        Returns ``None`` when no chunking strategy is configured, or when the
        strategy is not one of the local ones and ``partition_by_api`` is falsy.
        """
        # -- No chunking_strategy means no chunking --
        if self.chunking_config.chunking_strategy is None:
            return None
        # -- Chunk locally for open-source chunking strategies, even when partitioning remotely --
        if self.chunking_config.chunking_strategy in ("basic", "by_title"):
            from unstructured.chunking import dispatch
            from unstructured.staging.base import elements_from_json

            return dispatch.chunk(
                elements=elements_from_json(filename=elements_json_file),
                chunking_strategy=self.chunking_config.chunking_strategy,
                combine_text_under_n_chars=self.chunking_config.combine_text_under_n_chars,
                include_orig_elements=self.chunking_config.include_orig_elements,
                max_characters=self.chunking_config.max_characters,
                multipage_sections=self.chunking_config.multipage_sections,
                new_after_n_chars=self.chunking_config.new_after_n_chars,
                overlap=self.chunking_config.overlap,
                overlap_all=self.chunking_config.overlap_all,
            )
        # -- Chunk remotely --
        if self.partition_config.partition_by_api:
            from unstructured.partition.api import partition_via_api

            return partition_via_api(
                filename=elements_json_file,
                # -- NOTE(jennings): If api_key or api_url are None, partition_via_api will raise an
                # -- error, which will be caught and logged by Chunker.run()
                api_key=self.partition_config.api_key,  # type: ignore
                api_url=self.partition_config.partition_endpoint,  # type: ignore
                chunking_strategy=self.chunking_config.chunking_strategy,
                combine_under_n_chars=self.chunking_config.combine_text_under_n_chars,
                include_orig_elements=self.chunking_config.include_orig_elements,
                max_characters=self.chunking_config.max_characters,
                multipage_sections=self.chunking_config.multipage_sections,
                new_after_n_chars=self.chunking_config.new_after_n_chars,
                overlap=self.chunking_config.overlap,
                overlap_all=self.chunking_config.overlap_all,
            )
        # -- Warn that the defined chunking_strategy is not locally available --
        logger.warning(
            f"There is no locally available chunking_strategy:"
            f" {self.chunking_config.chunking_strategy}."
            f" If trying to partition remotely, check that `partition_by_api`, `api_url`,"
            f" and `api_key` are correctly defined."
        )
        return None
@@ -0,0 +1,64 @@
1
+ import hashlib
2
+ import json
3
+ import os.path
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from unstructured_ingest.interfaces import (
9
+ EmbeddingConfig,
10
+ )
11
+ from unstructured_ingest.logger import logger
12
+ from unstructured_ingest.pipeline.interfaces import ReformatNode
13
+
14
+
15
@dataclass
class Embedder(ReformatNode):
    """Implementation for the embedding node in the ingest pipeline.

    Loads a JSON file of elements, passes them to the embedder obtained from
    ``embedder_config`` and writes the result to a JSON file inside the
    directory returned by ``get_path()``.
    """

    embedder_config: EmbeddingConfig

    def initialize(self):
        """Log the embedding configuration, then run the parent initialization."""
        logger.info(
            f"Running embedding node. Embedding config: {self.embedder_config.to_json()}",
        )
        super().initialize()

    def create_hash(self) -> str:
        """Return a 32-character hex digest identifying the embedding configuration."""
        hash_dict = self.embedder_config.to_dict()
        return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

    def run(self, elements_json: str) -> Optional[str]:
        """Embed the elements stored in ``elements_json``.

        Returns the path of the JSON file holding the embedded elements, or
        ``None`` when an error was logged instead of raised
        (``pipeline_context.raise_on_error`` is falsy).
        """
        try:
            filename_ext = os.path.basename(elements_json)
            filename = os.path.splitext(filename_ext)[0]
            # output name depends on both the embedding config and the input name
            hashed_filename = hashlib.sha256(
                f"{self.create_hash()}{filename}".encode(),
            ).hexdigest()[:32]
            json_filename = f"{hashed_filename}.json"
            json_path = (Path(self.get_path()) / json_filename).resolve()
            # register the source doc under this node's output name as well
            self.pipeline_context.ingest_docs_map[hashed_filename] = (
                self.pipeline_context.ingest_docs_map[filename]
            )
            if (
                not self.pipeline_context.reprocess
                and json_path.is_file()
                and json_path.stat().st_size
            ):
                logger.debug(f"file exists: {json_path}, skipping embedding")
                return str(json_path)
            # Read with an explicit encoding: the output below is written as UTF-8
            # with ensure_ascii=False, so relying on the platform default encoding
            # for the input could mis-decode non-ASCII text.
            with open(elements_json, encoding="utf8") as f:
                elements = json.load(f)
            embedder = self.embedder_config.get_embedder()
            element_dicts = embedder.embed_documents(elements=elements)
            with open(json_path, "w", encoding="utf8") as output_f:
                logger.info(f"writing embeddings content to {json_path}")
                json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
            return str(json_path)
        except Exception as e:
            if self.pipeline_context.raise_on_error:
                raise
            logger.error(f"failed to embed content from file {elements_json}, {e}", exc_info=True)
            return None

    def get_path(self) -> Path:
        """Return the resolved output directory inside the pipeline work dir."""
        # NOTE(review): the directory is literally named "embedded.py", which looks
        # like an accidental rename of "embedded". Kept unchanged so results already
        # cached in existing work dirs are still found -- confirm before renaming.
        return (Path(self.pipeline_context.work_dir) / "embedded.py").resolve()
@@ -0,0 +1,77 @@
1
+ import os
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
6
+ from unstructured_ingest.interfaces import (
7
+ BaseIngestDocBatch,
8
+ BaseSessionHandle,
9
+ BaseSingleIngestDoc,
10
+ IngestDocSessionHandleMixin,
11
+ )
12
+ from unstructured_ingest.logger import logger
13
+ from unstructured_ingest.pipeline.interfaces import SourceNode
14
+
15
# module-level variable to store session handle, so that it is created once and
# then reused by later Reader.run() calls in the same process
session_handle: t.Optional[BaseSessionHandle] = None


@dataclass
class Reader(SourceNode):
    """Source node that fetches the data behind a serialized ingest doc."""

    def get_single(self, doc: BaseSingleIngestDoc, ingest_doc_dict: dict) -> str:
        """Fetch a single doc unless a non-empty local copy can be reused.

        ``ingest_doc_dict`` is updated in place with ``doc.to_dict()`` and
        ``doc.filename`` is returned.
        """
        if (
            not self.read_config.re_download
            and doc.filename.is_file()
            and doc.filename.stat().st_size
        ):
            logger.info(f"file exists: {doc.filename}, skipping download")
            # Still need to fetch metadata if file exists locally
            doc.update_source_metadata()
        else:
            serialized_doc = doc.to_json(redact_sensitive=True)
            logger.debug(f"fetching {serialized_doc} - PID: {os.getpid()}")
            if self.retry_strategy:
                self.retry_strategy(doc.get_file)
            else:
                doc.get_file()
        ingest_doc_dict.update(doc.to_dict())
        return doc.filename

    def get_batch(self, doc_batch: BaseIngestDocBatch, ingest_doc_dict: dict) -> t.List[str]:
        """Fetch every file of a batch and return the filenames of its docs.

        ``ingest_doc_dict`` is updated in place with ``doc_batch.to_dict()``.
        """
        if self.retry_strategy:
            self.retry_strategy(doc_batch.get_files)
        else:
            doc_batch.get_files()
        ingest_doc_dict.update(doc_batch.to_dict())
        return [doc.filename for doc in doc_batch.ingest_docs]

    def run(self, ingest_doc_dict: dict) -> t.Optional[t.Union[str, t.List[str]]]:
        """Rebuild the ingest doc from ``ingest_doc_dict`` and fetch its data.

        Returns the filename (single doc) or list of filenames (batch), or
        ``None`` when an error was logged instead of raised
        (``pipeline_context.raise_on_error`` is falsy).
        """
        try:
            global session_handle
            doc = create_ingest_doc_from_dict(ingest_doc_dict)
            if isinstance(doc, IngestDocSessionHandleMixin):
                if session_handle is None:
                    # create via doc.session_handle, which is a property that creates a
                    # session handle if one is not already defined
                    session_handle = doc.session_handle
                else:
                    doc._session_handle = session_handle
            if isinstance(doc, BaseSingleIngestDoc):
                return self.get_single(doc=doc, ingest_doc_dict=ingest_doc_dict)
            elif isinstance(doc, BaseIngestDocBatch):
                return self.get_batch(doc_batch=doc, ingest_doc_dict=ingest_doc_dict)
            else:
                raise ValueError(
                    f"type of doc ({type(doc)}) is not a recognized type: "
                    f"BaseSingleIngestDoc or BaseIngestDocBatch"
                )
        except Exception as e:
            if self.pipeline_context.raise_on_error:
                raise
            logger.error(
                f"failed to get data associated with source doc: {ingest_doc_dict}, {e}",
                exc_info=True,
            )
            return None
@@ -0,0 +1,6 @@
1
+ import hashlib
2
+
3
+
4
def get_ingest_doc_hash(json_as_dict: dict) -> str:
    """Return the first 32 hex characters of the SHA-256 digest of ``json_as_dict["unique_id"]``."""
    unique_id_bytes = json_as_dict["unique_id"].encode()
    digest = hashlib.sha256(unique_id_bytes).hexdigest()
    return digest[:32]
@@ -0,0 +1,18 @@
1
+ import os.path
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
6
+ from unstructured_ingest.pipeline.interfaces import WriteNode
7
+
8
+
9
@dataclass
class Writer(WriteNode):
    """Write node that hands the ingest docs behind a set of JSON files to the destination."""

    def run(self, json_paths: t.List[str]):
        """Rebuild one ingest doc per path and write them all in a single call.

        The file name of each path, without its extension, is the key used to
        look the doc up in ``pipeline_context.ingest_docs_map``.
        """
        # lazy generator, so every lookup/creation still happens path by path
        doc_hashes = (os.path.splitext(os.path.basename(path))[0] for path in json_paths)
        ingest_docs = [
            create_ingest_doc_from_dict(self.pipeline_context.ingest_docs_map[doc_hash])
            for doc_hash in doc_hashes
        ]
        self.dest_doc_connector.write(docs=ingest_docs)
@@ -0,0 +1,93 @@
1
+ from __future__ import annotations
2
+
3
+ import multiprocessing as mp
4
+ from contextlib import suppress
5
+ from typing import Optional
6
+
7
+ from unstructured_ingest.interfaces import (
8
+ BaseDestinationConnector,
9
+ BaseSourceConnector,
10
+ ChunkingConfig,
11
+ EmbeddingConfig,
12
+ PartitionConfig,
13
+ PermissionsConfig,
14
+ ProcessorConfig,
15
+ RetryStrategyConfig,
16
+ )
17
+ from unstructured_ingest.pipeline import (
18
+ Chunker,
19
+ DocFactory,
20
+ Embedder,
21
+ Partitioner,
22
+ PermissionsDataCleaner,
23
+ Pipeline,
24
+ PipelineContext,
25
+ Reader,
26
+ ReformatNode,
27
+ Writer,
28
+ )
29
+
30
# set_start_method raises RuntimeError in the case suppressed here (e.g. when the
# start method was already set), so the import stays side-effect safe
with suppress(RuntimeError):
    mp.set_start_method("spawn")


def process_documents(
    processor_config: ProcessorConfig,
    source_doc_connector: BaseSourceConnector,
    partition_config: PartitionConfig,
    dest_doc_connector: Optional[BaseDestinationConnector] = None,
    chunking_config: Optional[ChunkingConfig] = None,
    embedder_config: Optional[EmbeddingConfig] = None,
    permissions_config: Optional[PermissionsConfig] = None,
    retry_strategy_config: Optional[RetryStrategyConfig] = None,
) -> None:
    """Assemble the ingest pipeline from the given configs and run it.

    The doc factory, reader and partitioner nodes are always created. The
    chunker, embedder, writer and permissions-cleaner nodes are only created
    when their respective config/connector argument is truthy.
    """
    context = PipelineContext.from_dict(processor_config.to_dict())

    factory_node = DocFactory(
        pipeline_context=context,
        source_doc_connector=source_doc_connector,
    )
    source_node = Reader(
        pipeline_context=context,
        retry_strategy_config=retry_strategy_config,
        read_config=source_doc_connector.read_config,
    )
    partition_node = Partitioner(pipeline_context=context, partition_config=partition_config)

    # optional reformatting steps, in the order they are applied: chunk, then embed
    reformatters: list[ReformatNode] = []
    if chunking_config:
        chunker = Chunker(
            pipeline_context=context,
            chunking_config=chunking_config,
            partition_config=partition_config,
        )
        reformatters.append(chunker)
    if embedder_config:
        embedder = Embedder(
            pipeline_context=context,
            embedder_config=embedder_config,
        )
        reformatters.append(embedder)

    write_node = None
    if dest_doc_connector:
        write_node = Writer(
            pipeline_context=context,
            dest_doc_connector=dest_doc_connector,
        )

    permissions_node = None
    if permissions_config:
        permissions_node = PermissionsDataCleaner(
            pipeline_context=context,
            processor_config=processor_config,
        )

    Pipeline(
        pipeline_context=context,
        doc_factory_node=factory_node,
        source_node=source_node,
        partition_node=partition_node,
        reformat_nodes=reformatters,
        write_node=write_node,
        permissions_node=permissions_node,
    ).run()
@@ -0,0 +1,104 @@
1
+ import typing as t
2
+ from typing import Type
3
+
4
+ from .airtable import AirtableRunner
5
+ from .astradb import AstraDBRunner
6
+ from .base_runner import Runner
7
+ from .biomed import BiomedRunner
8
+ from .confluence import ConfluenceRunner
9
+ from .delta_table import DeltaTableRunner
10
+ from .discord import DiscordRunner
11
+ from .elasticsearch import ElasticSearchRunner
12
+ from .fsspec.azure import AzureRunner
13
+ from .fsspec.box import BoxRunner
14
+ from .fsspec.dropbox import DropboxRunner
15
+ from .fsspec.fsspec import FsspecRunner
16
+ from .fsspec.gcs import GCSRunner
17
+ from .fsspec.s3 import S3Runner
18
+ from .fsspec.sftp import SftpRunner
19
+ from .github import GithubRunner
20
+ from .gitlab import GitlabRunner
21
+ from .google_drive import GoogleDriveRunner
22
+ from .hubspot import HubSpotRunner
23
+ from .jira import JiraRunner
24
+ from .kafka import KafkaRunner
25
+ from .local import LocalRunner
26
+ from .mongodb import MongoDBRunner
27
+ from .notion import NotionRunner
28
+ from .onedrive import OneDriveRunner
29
+ from .opensearch import OpenSearchRunner
30
+ from .outlook import OutlookRunner
31
+ from .reddit import RedditRunner
32
+ from .salesforce import SalesforceRunner
33
+ from .sharepoint import SharePointRunner
34
+ from .slack import SlackRunner
35
+ from .wikipedia import WikipediaRunner
36
+
37
# Maps the source connector name to the Runner subclass handling it.
# "gdrive" and "google_drive" both resolve to GoogleDriveRunner.
runner_map: t.Dict[str, Type[Runner]] = {
    "airtable": AirtableRunner,
    "astradb": AstraDBRunner,
    "azure": AzureRunner,
    "biomed": BiomedRunner,
    "box": BoxRunner,
    "confluence": ConfluenceRunner,
    "delta_table": DeltaTableRunner,
    "discord": DiscordRunner,
    "dropbox": DropboxRunner,
    "elasticsearch": ElasticSearchRunner,
    "fsspec": FsspecRunner,
    "gcs": GCSRunner,
    "github": GithubRunner,
    "gitlab": GitlabRunner,
    "gdrive": GoogleDriveRunner,
    "google_drive": GoogleDriveRunner,
    "hubspot": HubSpotRunner,
    "jira": JiraRunner,
    "kafka": KafkaRunner,
    "local": LocalRunner,
    "mongodb": MongoDBRunner,
    "notion": NotionRunner,
    "onedrive": OneDriveRunner,
    "opensearch": OpenSearchRunner,
    "outlook": OutlookRunner,
    "reddit": RedditRunner,
    "s3": S3Runner,
    "salesforce": SalesforceRunner,
    "sftp": SftpRunner,
    "sharepoint": SharePointRunner,
    "slack": SlackRunner,
    "wikipedia": WikipediaRunner,
}

# Every name listed here must exist in this module, otherwise
# `from <package> import *` raises AttributeError.
__all__ = [
    "AirtableRunner",
    "AstraDBRunner",
    "AzureRunner",
    "BiomedRunner",
    "BoxRunner",
    "ConfluenceRunner",
    "DeltaTableRunner",
    "DiscordRunner",
    "DropboxRunner",
    "ElasticSearchRunner",
    "FsspecRunner",
    "GCSRunner",
    "GoogleDriveRunner",
    "GithubRunner",
    "GitlabRunner",
    "HubSpotRunner",
    "JiraRunner",
    "KafkaRunner",
    "LocalRunner",
    "MongoDBRunner",
    "NotionRunner",
    "OneDriveRunner",
    "OpenSearchRunner",
    "OutlookRunner",
    "RedditRunner",
    "S3Runner",
    "SalesforceRunner",
    "SftpRunner",
    "SharePointRunner",
    "SlackRunner",
    "WikipediaRunner",
    "runner_map",
    "Runner",
]
@@ -0,0 +1,35 @@
1
+ import hashlib
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.interfaces import BaseSourceConnector
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.runner.base_runner import Runner
8
+ from unstructured_ingest.runner.utils import update_download_dir_hash
9
+
10
+ if t.TYPE_CHECKING:
11
+ from unstructured_ingest.connector.airtable import SimpleAirtableConfig
12
+
13
+
14
@dataclass
class AirtableRunner(Runner):
    """Runner for the Airtable source connector."""

    connector_config: "SimpleAirtableConfig"

    def update_read_config(self):
        """Replace ``read_config.download_dir`` with the result of ``update_download_dir_hash``.

        The helper receives a SHA-256 hash object built from the personal
        access token of the connector's access config.
        """
        token = self.connector_config.access_config.personal_access_token
        token_hash = hashlib.sha256(token.encode("utf-8"))
        new_download_dir = update_download_dir_hash(
            connector_name="airtable",
            read_config=self.read_config,
            hashed_dir_name=token_hash,
            logger=logger,
        )
        self.read_config.download_dir = new_download_dir

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the Airtable source connector class (imported lazily)."""
        from unstructured_ingest.connector.airtable import AirtableSourceConnector

        return AirtableSourceConnector
@@ -0,0 +1,34 @@
1
+ import hashlib
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ from unstructured_ingest.interfaces import BaseSourceConnector
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.runner.base_runner import Runner
8
+ from unstructured_ingest.runner.utils import update_download_dir_hash
9
+
10
+ if t.TYPE_CHECKING:
11
+ from unstructured_ingest.connector.astradb import SimpleAstraDBConfig
12
+
13
+
14
@dataclass
class AstraDBRunner(Runner):
    """Runner for the Astra DB source connector."""

    connector_config: "SimpleAstraDBConfig"

    def update_read_config(self):
        """Replace ``read_config.download_dir`` with the result of ``update_download_dir_hash``.

        The helper receives a SHA-256 hash object built from the string form
        of the API endpoint of the connector's access config.
        """
        endpoint = str(self.connector_config.access_config.api_endpoint)
        endpoint_hash = hashlib.sha256(endpoint.encode("utf-8"))
        new_download_dir = update_download_dir_hash(
            connector_name="astradb",
            read_config=self.read_config,
            hashed_dir_name=endpoint_hash,
            logger=logger,
        )
        self.read_config.download_dir = new_download_dir

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the Astra DB source connector class (imported lazily)."""
        from unstructured_ingest.connector.astradb import AstraDBSourceConnector

        return AstraDBSourceConnector
@@ -0,0 +1,89 @@
1
+ import logging
2
+ import typing as t
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass
5
+
6
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
7
+ from unstructured_ingest.interfaces import (
8
+ BaseConnectorConfig,
9
+ BaseDestinationConnector,
10
+ BaseSourceConnector,
11
+ ChunkingConfig,
12
+ EmbeddingConfig,
13
+ PartitionConfig,
14
+ PermissionsConfig,
15
+ ProcessorConfig,
16
+ ReadConfig,
17
+ RetryStrategyConfig,
18
+ )
19
+ from unstructured_ingest.logger import ingest_log_streaming_init
20
+ from unstructured_ingest.processor import process_documents
21
+ from unstructured_ingest.runner.writers.base_writer import Writer
22
+
23
+
24
@dataclass
class Runner(EnhancedDataClassJsonMixin, ABC):
    """Abstract base for connector runners.

    Holds every config needed for one ingest run. Subclasses provide the
    source connector class and adjust the read config before the run starts.
    """

    connector_config: BaseConnectorConfig
    processor_config: ProcessorConfig
    read_config: ReadConfig
    partition_config: PartitionConfig
    writer: t.Optional[Writer] = None
    writer_kwargs: t.Optional[dict] = None
    embedding_config: t.Optional[EmbeddingConfig] = None
    chunking_config: t.Optional[ChunkingConfig] = None
    permissions_config: t.Optional[PermissionsConfig] = None
    retry_strategy_config: t.Optional[RetryStrategyConfig] = None

    def run(self, *args, **kwargs):
        """Set up log streaming, update the read config and process the documents."""
        log_level = logging.DEBUG if self.processor_config.verbose else logging.INFO
        ingest_log_streaming_init(log_level)
        self.update_read_config()
        connector = self.get_source_connector()
        self.process_documents(source_doc_connector=connector)

    @abstractmethod
    def update_read_config(self):
        """Adjust ``self.read_config`` before the source connector is built."""

    @abstractmethod
    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the source connector class this runner instantiates."""

    def get_source_connector(self) -> BaseSourceConnector:
        """Instantiate the source connector with this runner's configs."""
        connector_cls = self.get_source_connector_cls()
        return connector_cls(
            processor_config=self.processor_config,
            connector_config=self.connector_config,
            read_config=self.read_config,
        )

    def get_dest_doc_connector(self) -> t.Optional[BaseDestinationConnector]:
        """Return the writer's connector, or ``None`` when no writer is set."""
        if not self.writer:
            return None
        extra_kwargs = self.writer_kwargs or {}
        return self.writer.get_connector(**extra_kwargs)

    def get_permissions_config(self) -> t.Optional[PermissionsConfig]:
        """Return the permissions config only when it is completely filled in.

        ``application_id``, ``client_cred`` and ``tenant`` must all be truthy;
        otherwise ``None`` is returned.
        """
        config = self.permissions_config
        if config is None:
            return None
        if config.application_id and config.client_cred and config.tenant:
            return config
        return None

    def process_documents(self, source_doc_connector: BaseSourceConnector):
        """Forward this runner's configs and connectors to the module-level ``process_documents``."""
        process_documents(
            processor_config=self.processor_config,
            source_doc_connector=source_doc_connector,
            partition_config=self.partition_config,
            dest_doc_connector=self.get_dest_doc_connector(),
            embedder_config=self.embedding_config,
            chunking_config=self.chunking_config,
            permissions_config=self.get_permissions_config(),
            retry_strategy_config=self.retry_strategy_config,
        )