unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,79 @@
1
+ import asyncio
2
+ import hashlib
3
+ import json
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Callable, Optional, TypedDict
7
+
8
+ from unstructured_ingest.v2.interfaces import FileData
9
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
10
+ from unstructured_ingest.v2.logger import logger
11
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
12
+ from unstructured_ingest.v2.processes.embedder import Embedder
13
+ from unstructured_ingest.v2.utils import serialize_base_model_json
14
+
15
# Default value of `EmbedStep.identifier`.
STEP_ID = "embed"


class EmbedStepResponse(TypedDict):
    """Dictionary returned by `EmbedStep._run_async` for a single input file."""

    # Path of the file-data JSON the step was called with (passed through unchanged).
    file_data_path: str
    # Path of the JSON file the embedded output is stored in.
    path: str
21
+
22
+
23
@dataclass
class EmbedStep(PipelineStep):
    """Pipeline step that runs an `Embedder` callable and stores its result as JSON.

    Output files live in `self.cache_dir` and are named after a hash of the
    embedder configuration plus the input file name, so an existing output can
    be reused instead of being recomputed.
    """

    process: Embedder
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.config.embedding_provider})"

    def __post_init__(self):
        # Log the serialized configuration (or None when no config is set).
        if self.process.config:
            config = self.process.config.model_dump_json()
        else:
            config = None
        logger.info(f"created {self.identifier} with configs: {config}")

    def should_embed(self, filepath: Path, file_data: FileData) -> bool:
        """Return True when embedding must run for this file.

        Embedding runs when reprocessing is requested on the context or on the
        file data, or when the output file does not exist yet.
        """
        if not (self.context.reprocess or file_data.reprocess):
            return not filepath.exists()
        return True

    def get_output_filepath(self, filename: Path) -> Path:
        """Return the resolved output path for `filename`, creating its parent directory."""
        digest = self.get_hash(extras=[filename.name])
        target = (self.cache_dir / f"{digest}.json").resolve()
        target.parent.mkdir(parents=True, exist_ok=True)
        return target

    def _save_output(self, output_filepath: str, embedded_content: list[dict]):
        """Write `embedded_content` to `output_filepath` as indented JSON."""
        with open(str(output_filepath), "w") as out_file:
            logger.debug(f"writing embedded output to: {output_filepath}")
            json.dump(embedded_content, out_file, indent=2)

    async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
        """Embed the elements file at `path` unless a usable output already exists.

        Args:
            fn: sync or async callable invoked with `elements_filepath`.
            path: path of the elements file to embed.
            file_data_path: path of the file-data JSON describing the file.

        Returns:
            The file-data path together with the path of the embedded output.
        """
        elements_path = Path(path)
        file_data = file_data_from_file(path=file_data_path)
        target = self.get_output_filepath(filename=elements_path)
        if not self.should_embed(filepath=target, file_data=file_data):
            logger.debug(f"skipping embedding, output already exists: {target}")
            return EmbedStepResponse(file_data_path=file_data_path, path=str(target))

        call_kwargs = {"elements_filepath": elements_path}
        if asyncio.iscoroutinefunction(fn):
            # Only coroutine functions are throttled by the optional context semaphore.
            semaphore = self.context.semaphore
            if semaphore:
                async with semaphore:
                    embedded = await fn(**call_kwargs)
            else:
                embedded = await fn(**call_kwargs)
        else:
            embedded = fn(**call_kwargs)

        self._save_output(output_filepath=str(target), embedded_content=embedded)
        return EmbedStepResponse(file_data_path=file_data_path, path=str(target))

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Return the first 12 hex characters of a SHA-256 over the config JSON and `extras`."""
        serialized = serialize_base_model_json(
            model=self.process.config, sort_keys=True, ensure_ascii=True
        )
        suffix = "".join(extras) if extras else ""
        return hashlib.sha256((serialized + suffix).encode()).hexdigest()[:12]
@@ -0,0 +1,35 @@
1
+ import asyncio
2
+ from dataclasses import dataclass
3
+ from typing import Callable, Optional
4
+
5
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
6
+ from unstructured_ingest.v2.logger import logger
7
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
8
+ from unstructured_ingest.v2.processes.filter import Filterer
9
+
10
# Default value of `FilterStep.identifier`.
STEP_ID = "filter"
11
+
12
+
13
@dataclass
class FilterStep(PipelineStep):
    """Pipeline step that asks a `Filterer` callable whether a file should be kept."""

    process: Filterer
    identifier: str = STEP_ID

    def __post_init__(self):
        # Log the serialized configuration (or None when no config is set).
        if self.process.config:
            config = self.process.config.model_dump_json()
        else:
            config = None
        logger.info(f"created {self.identifier} with configs: {config}")

    async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
        """Run the filter callable on the file data stored at `file_data_path`.

        Returns:
            `{"file_data_path": file_data_path}` when the callable returns a
            truthy value, otherwise None.
        """
        call_kwargs = {"file_data": file_data_from_file(path=file_data_path)}
        if asyncio.iscoroutinefunction(fn):
            # Only coroutine functions are throttled by the optional context semaphore.
            semaphore = self.context.semaphore
            if semaphore:
                async with semaphore:
                    keep = await fn(**call_kwargs)
            else:
                keep = await fn(**call_kwargs)
        else:
            keep = fn(**call_kwargs)

        return {"file_data_path": file_data_path} if keep else None
@@ -0,0 +1,86 @@
1
+ import hashlib
2
+ import json
3
+ from dataclasses import dataclass
4
+ from typing import AsyncGenerator, Generator, Optional, TypeVar
5
+
6
+ from unstructured_ingest.v2.interfaces.indexer import Indexer
7
+ from unstructured_ingest.v2.logger import logger
8
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
9
+ from unstructured_ingest.v2.pipeline.otel import instrument
10
+ from unstructured_ingest.v2.utils import serialize_base_model_json
11
+
12
# Type variable bounded by `Indexer`; annotates `IndexStep.process`.
IndexerT = TypeVar("IndexerT", bound=Indexer)

# Default value of `IndexStep.identifier`; also used as the span name for `IndexStep.run`.
STEP_ID = "index"
15
+
16
+
17
@dataclass
class IndexStep(PipelineStep):
    """Pipeline step that persists each file data record produced by an indexer.

    Every record yielded by the indexer is written as JSON into `self.cache_dir`
    under a name derived from a hash of the index/connection configuration and
    the record identifier. The path of each written file is yielded.
    """

    process: IndexerT
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.__class__.__name__})"

    def __post_init__(self):
        # Log the serialized configurations (None for any that is not set).
        config = self.process.index_config.model_dump_json() if self.process.index_config else None
        connection_config = (
            self.process.connection_config.model_dump_json()
            if self.process.connection_config
            else None
        )
        logger.info(
            f"created {self.identifier} with configs: {config}, "
            f"connection configs: {connection_config}"
        )

    def _persist_file_data(self, file_data) -> str:
        """Write one file data record to the cache directory and return its path."""
        record_hash = self.get_hash(extras=[file_data.identifier])
        filepath = (self.cache_dir / f"{record_hash}.json").resolve()
        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(str(filepath), "w") as f:
            json.dump(file_data.model_dump(), f, indent=2)
        return str(filepath)

    @instrument(span_name=STEP_ID)
    def run(self) -> Generator[str, None, None]:
        """Yield the path of the JSON file written for each indexed record.

        A failure on one record is logged and the record is skipped, unless
        `self.context.raise_on_error` is set, in which case it is re-raised.
        """
        for file_data in self.process.run():
            logger.debug(f"generated file data: {file_data.model_dump()}")
            try:
                # The yield stays inside the try so the guarded region matches
                # the one used before the persistence logic was factored out.
                yield self._persist_file_data(file_data)
            except Exception:
                logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
                if self.context.raise_on_error:
                    raise

    async def run_async(self) -> AsyncGenerator[str, None]:
        """Async counterpart of `run`, driven by the indexer's `run_async`."""
        async for file_data in self.process.run_async():
            logger.debug(f"generated file data: {file_data.model_dump()}")
            try:
                yield self._persist_file_data(file_data)
            except Exception:
                logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
                if self.context.raise_on_error:
                    raise

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Return the first 12 hex characters of a SHA-256 over both configs and `extras`."""
        index_config_dict = json.loads(
            serialize_base_model_json(model=self.process.index_config, sort_keys=True)
        )
        connection_config_dict = json.loads(
            serialize_base_model_json(model=self.process.connection_config, sort_keys=True)
        )
        hashable_dict = {
            "index_config": index_config_dict,
            "connection_config": connection_config_dict,
        }
        hashable_string = json.dumps(hashable_dict, sort_keys=True)
        if extras:
            hashable_string += "".join(extras)
        return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
@@ -0,0 +1,79 @@
1
+ import asyncio
2
+ import hashlib
3
+ import json
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Callable, Optional, TypedDict
7
+
8
+ from unstructured_ingest.v2.interfaces import FileData
9
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
10
+ from unstructured_ingest.v2.logger import logger
11
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
12
+ from unstructured_ingest.v2.processes.partitioner import Partitioner
13
+ from unstructured_ingest.v2.utils import serialize_base_model_json
14
+
15
# Default value of `PartitionStep.identifier`.
STEP_ID = "partition"


class PartitionStepResponse(TypedDict):
    """Dictionary returned by `PartitionStep._run_async` for a single input file."""

    # Path of the file-data JSON the step was called with (passed through unchanged).
    file_data_path: str
    # Path of the JSON file the partitioned output is stored in.
    path: str
21
+
22
+
23
@dataclass
class PartitionStep(PipelineStep):
    """Pipeline step that runs a `Partitioner` callable and stores its result as JSON.

    Output files live in `self.cache_dir` and are named after a hash of the
    partitioner configuration plus the file-data file name.
    """

    process: Partitioner
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.config.strategy})"

    def __post_init__(self):
        serialized_config = self.process.config.model_dump_json()
        logger.info(f"created {self.identifier} with configs: {serialized_config}")

    def should_partition(self, filepath: Path, file_data: FileData) -> bool:
        """Return True when partitioning must run for this file.

        Partitioning runs when reprocessing is requested on the context or on
        the file data, or when the output file does not exist yet.
        """
        if not (self.context.reprocess or file_data.reprocess):
            return not filepath.exists()
        return True

    def get_output_filepath(self, filename: Path) -> Path:
        """Return the resolved output path for `filename`, creating its parent directory."""
        digest = self.get_hash(extras=[filename.name])
        target = (self.cache_dir / f"{digest}.json").resolve()
        target.parent.mkdir(parents=True, exist_ok=True)
        return target

    def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
        """Write `partitioned_content` to `output_filepath` as indented JSON."""
        with open(str(output_filepath), "w") as out_file:
            logger.debug(f"writing partitioned output to: {output_filepath}")
            json.dump(partitioned_content, out_file, indent=2)

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str
    ) -> Optional[PartitionStepResponse]:
        """Partition the file at `path` unless a usable output already exists.

        Args:
            fn: sync or async callable invoked with `filename` and `metadata`.
            path: path of the file to partition.
            file_data_path: path of the file-data JSON describing the file.

        Returns:
            The file-data path together with the path of the partitioned output.
        """
        source_path = Path(path)
        file_data = file_data_from_file(path=file_data_path)
        # The output name is keyed on the file-data file name, not on `path`.
        target = self.get_output_filepath(filename=Path(file_data_path))
        if not self.should_partition(filepath=target, file_data=file_data):
            logger.debug(f"skipping partitioning, output already exists: {target}")
            return PartitionStepResponse(file_data_path=file_data_path, path=str(target))

        call_kwargs = {"filename": source_path, "metadata": file_data.metadata.model_dump()}
        if asyncio.iscoroutinefunction(fn):
            # Only coroutine functions are throttled by the optional context semaphore.
            semaphore = self.context.semaphore
            if semaphore:
                async with semaphore:
                    elements = await fn(**call_kwargs)
            else:
                elements = await fn(**call_kwargs)
        else:
            elements = fn(**call_kwargs)

        self._save_output(output_filepath=str(target), partitioned_content=elements)
        return PartitionStepResponse(file_data_path=file_data_path, path=str(target))

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Return the first 12 hex characters of a SHA-256 over the config JSON and `extras`."""
        serialized = serialize_base_model_json(
            model=self.process.config, sort_keys=True, ensure_ascii=True
        )
        suffix = "".join(extras) if extras else ""
        return hashlib.sha256((serialized + suffix).encode()).hexdigest()[:12]
@@ -0,0 +1,65 @@
1
+ import asyncio
2
+ import hashlib
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Callable, Optional, TypedDict
6
+
7
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
8
+ from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
9
+ from unstructured_ingest.v2.logger import logger
10
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
11
+ from unstructured_ingest.v2.utils import serialize_base_model_json
12
+
13
# Default value of `UploadStageStep.identifier`.
STEP_ID = "upload_stage"


class UploadStageStepResponse(TypedDict):
    """Dictionary returned by `UploadStageStep._run_async` for a single input file."""

    # Path of the file-data JSON the step was called with (passed through unchanged).
    file_data_path: str
    # String form of the path returned by the stager callable.
    path: str
19
+
20
+
21
@dataclass
class UploadStageStep(PipelineStep):
    """Pipeline step that hands an elements file to an `UploadStager` callable."""

    process: UploadStager
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.__class__.__name__})"

    def __post_init__(self):
        # Serialize the stager configuration for logging (None when not set).
        if self.process.upload_stager_config:
            config = self.process.upload_stager_config.model_dump_json()
        else:
            config = None
        # The cache directory is created up front; it is passed to the stager as output_dir.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"created {self.identifier} with configs: {config}")

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str
    ) -> UploadStageStepResponse:
        """Stage the elements file at `path` and report where the staged output is.

        Args:
            fn: sync or async stager callable.
            path: path of the elements file to stage.
            file_data_path: path of the file-data JSON describing the file.

        Returns:
            The file-data path together with the path returned by `fn`.
        """
        elements_path = Path(path)
        # Hashed name that keeps the extension of the incoming file.
        staged_name = f"{self.get_hash(extras=[elements_path.name])}{elements_path.suffix}"
        call_kwargs = {
            "elements_filepath": elements_path,
            "file_data": file_data_from_file(path=file_data_path),
            "output_dir": self.cache_dir,
            "output_filename": staged_name,
        }
        if asyncio.iscoroutinefunction(fn):
            # Only coroutine functions are throttled by the optional context semaphore.
            semaphore = self.context.semaphore
            if semaphore:
                async with semaphore:
                    staged_path = await fn(**call_kwargs)
            else:
                staged_path = await fn(**call_kwargs)
        else:
            staged_path = fn(**call_kwargs)
        return UploadStageStepResponse(file_data_path=file_data_path, path=str(staged_path))

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Return the first 12 hex characters of a SHA-256 over the stager config and `extras`."""
        serialized = serialize_base_model_json(
            model=self.process.upload_stager_config, sort_keys=True, ensure_ascii=True
        )
        suffix = "".join(extras) if extras else ""
        return hashlib.sha256((serialized + suffix).encode()).hexdigest()[:12]
@@ -0,0 +1,50 @@
1
+ import asyncio
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Callable, TypedDict
5
+
6
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
7
+ from unstructured_ingest.v2.logger import logger
8
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
9
+ from unstructured_ingest.v2.processes.uncompress import Uncompressor
10
+
11
# Default value of `UncompressStep.identifier`.
STEP_ID = "uncompress"


class UncompressStepResponse(TypedDict):
    """One entry of the list returned by `UncompressStep._run_async`."""

    # Path of the file-data JSON written for the new file.
    file_data_path: str
    # `local_download_path` of the new file.
    path: str
17
+
18
+
19
@dataclass
class UncompressStep(PipelineStep):
    """Pipeline step that runs an `Uncompressor` callable and records each resulting file."""

    process: Uncompressor
    identifier: str = STEP_ID

    def __post_init__(self):
        # Log the serialized configuration (or None when no config is set).
        if self.process.config:
            config = self.process.config.model_dump_json()
        else:
            config = None
        logger.info(f"created {self.identifier} with configs: {config}")

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str
    ) -> list[UncompressStepResponse]:
        """Run the uncompress callable and write a file-data JSON per returned file.

        Each new file-data JSON is written next to `file_data_path`, named after
        the new file's identifier.

        Returns:
            One response per file returned by `fn`.
        """
        call_kwargs = {"file_data": file_data_from_file(path=file_data_path)}
        if asyncio.iscoroutinefunction(fn):
            # Only coroutine functions are throttled by the optional context semaphore.
            semaphore = self.context.semaphore
            if semaphore:
                async with semaphore:
                    extracted = await fn(**call_kwargs)
            else:
                extracted = await fn(**call_kwargs)
        else:
            extracted = fn(**call_kwargs)

        results = []
        for item in extracted:
            item_data_path = Path(file_data_path).parent / f"{item.identifier}.json"
            item.to_file(path=str(item_data_path.resolve()))
            response = UncompressStepResponse(
                path=item.local_download_path,
                file_data_path=str(item_data_path),
            )
            results.append(response)
        return results
@@ -0,0 +1,58 @@
1
+ import asyncio
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Callable, Optional, TypedDict
5
+
6
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
7
+ from unstructured_ingest.v2.interfaces.uploader import UploadContent
8
+ from unstructured_ingest.v2.logger import logger
9
+ from unstructured_ingest.v2.pipeline.interfaces import BatchPipelineStep
10
+ from unstructured_ingest.v2.pipeline.otel import instrument
11
+
12
# Default value of `UploadStep.identifier`; also used as the span name for `UploadStep._run_batch`.
STEP_ID = "upload"


class UploadStepContent(TypedDict):
    """One item of the batch accepted by `UploadStep._run_batch`."""

    # Path of the file to upload.
    path: str
    # Path of the file-data JSON describing that file.
    file_data_path: str
18
+
19
+
20
@dataclass
class UploadStep(BatchPipelineStep):
    """Pipeline step that passes files to the uploader, in a batch or one at a time."""

    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.__class__.__name__})"

    def __post_init__(self):
        # Serialize both configurations for logging (None for any that is not set).
        if self.process.upload_config:
            config = self.process.upload_config.model_dump_json()
        else:
            config = None
        if self.process.connection_config:
            connection_config = self.process.connection_config.model_dump_json()
        else:
            connection_config = None
        logger.info(
            f"Created {self.identifier} with configs: {config}, "
            f"connection configs: {connection_config}"
        )

    @instrument(span_name=STEP_ID)
    def _run_batch(self, contents: list[UploadStepContent]) -> None:
        """Convert `contents` into `UploadContent` objects and upload them as one batch."""
        batch = []
        for item in contents:
            batch.append(
                UploadContent(
                    path=Path(item["path"]),
                    file_data=file_data_from_file(item["file_data_path"]),
                )
            )
        self.process.run_batch(contents=batch)

    async def _run_async(self, path: str, file_data_path: str, fn: Optional[Callable] = None):
        """Upload a single file, falling back to `self.process.run_async` when `fn` is not given."""
        uploader = fn if fn else self.process.run_async
        call_kwargs = {"path": Path(path), "file_data": file_data_from_file(path=file_data_path)}
        if asyncio.iscoroutinefunction(uploader):
            # Only coroutine functions are throttled by the optional context semaphore.
            semaphore = self.context.semaphore
            if semaphore:
                async with semaphore:
                    await uploader(**call_kwargs)
            else:
                await uploader(**call_kwargs)
        else:
            uploader(**call_kwargs)
@@ -0,0 +1,18 @@
1
+ from .chunker import Chunker, ChunkerConfig
2
+ from .embedder import Embedder, EmbedderConfig
3
+ from .filter import Filterer, FiltererConfig
4
+ from .partitioner import Partitioner, PartitionerConfig
5
+ from .uncompress import UncompressConfig, Uncompressor
6
+
7
# Names re-exported from the sibling modules imported above.
__all__ = [
    "Chunker",
    "ChunkerConfig",
    "Embedder",
    "EmbedderConfig",
    "Filterer",
    "FiltererConfig",
    "Partitioner",
    "PartitionerConfig",
    "Uncompressor",
    "UncompressConfig",
]
@@ -0,0 +1,124 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Any, Optional
5
+
6
+ from pydantic import BaseModel, Field, SecretStr
7
+
8
+ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
9
+ from unstructured_ingest.utils.dep_check import requires_dependencies
10
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
11
+ from unstructured_ingest.v2.logger import logger
12
+ from unstructured_ingest.v2.unstructured_api import call_api_async
13
+
14
# Default for `ChunkerConfig.chunk_max_characters`.
CHUNK_MAX_CHARS_DEFAULT: int = 500
# Default for `ChunkerConfig.chunk_multipage_sections`.
CHUNK_MULTI_PAGE_DEFAULT: bool = True
16
+
17
+
18
class ChunkerConfig(BaseModel):
    """Settings for the chunking process.

    The first group of fields selects the strategy and, for API-based chunking,
    the endpoint and key. The `chunk_*` fields that follow tune the chunking
    itself and are forwarded through `to_chunking_kwargs`.
    """

    chunking_strategy: Optional[str] = Field(
        default=None, description="The rule-set to use to form chunks. Omit to disable chunking."
    )
    chunking_endpoint: Optional[str] = Field(
        default="https://api.unstructuredapp.io/general/v0/general",
        description="If chunking via api, use the following host.",
    )
    chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
    # Stored as a SecretStr; defaults to None.
    chunk_api_key: Optional[SecretStr] = Field(
        default=None, description="API Key for chunking endpoint."
    )

    chunk_combine_text_under_n_chars: Optional[int] = Field(
        default=None,
        description="Combine consecutive chunks when the first does not exceed this length and"
        " the second will fit without exceeding the hard-maximum length. Only"
        " operative for 'by_title' chunking-strategy.",
    )
    chunk_include_orig_elements: Optional[bool] = Field(
        default=None,
        description="When chunking, add the original elements consolidated to form each chunk to"
        " `.metadata.orig_elements` on that chunk.",
    )
    chunk_max_characters: int = Field(
        default=CHUNK_MAX_CHARS_DEFAULT,
        description="Hard maximum chunk length. No chunk will exceed this length. An oversized"
        " element will be divided by text-splitting to fit this window.",
    )
    chunk_multipage_sections: bool = Field(
        default=CHUNK_MULTI_PAGE_DEFAULT,
        description="Ignore page boundaries when chunking such that elements from two different"
        " pages can appear in the same chunk. Only operative for 'by_title'"
        " chunking-strategy.",
    )
    chunk_new_after_n_chars: Optional[int] = Field(
        default=None,
        description="Soft-maximum chunk length. Another element will not be added to a chunk of"
        " this length even when it would fit without exceeding the hard-maximum"
        " length.",
    )
    chunk_overlap: Optional[int] = Field(
        default=None,
        description="Prefix chunk text with last overlap=N characters of prior chunk. Only"
        " applies to oversized chunks divided by text-splitting. To apply overlap to"
        " non-oversized chunks use the --overlap-all option.",
    )
    chunk_overlap_all: Optional[bool] = Field(
        default=None,
        description="Apply overlap to chunks formed from whole elements as well as those formed"
        " by text-splitting oversized elements. Overlap length is take from --overlap"
        " option value.",
    )

    def to_chunking_kwargs(self) -> dict[str, Any]:
        """Return the chunking options as a keyword dictionary.

        Keys drop the `chunk_` prefix of the field names (and
        `chunk_combine_text_under_n_chars` becomes `combine_under_n_chars`).
        The endpoint, API key and `chunk_by_api` fields are not included.
        """
        return {
            "chunking_strategy": self.chunking_strategy,
            "combine_under_n_chars": self.chunk_combine_text_under_n_chars,
            "max_characters": self.chunk_max_characters,
            "include_orig_elements": self.chunk_include_orig_elements,
            "multipage_sections": self.chunk_multipage_sections,
            "new_after_n_chars": self.chunk_new_after_n_chars,
            "overlap": self.chunk_overlap,
            "overlap_all": self.chunk_overlap_all,
        }
83
+
84
+
85
@dataclass
class Chunker(BaseProcess, ABC):
    """Chunk partitioned elements, either locally (`run`) or through the API (`run_async`)."""

    config: ChunkerConfig

    def is_async(self) -> bool:
        """Return the `chunk_by_api` flag: API-based chunking is the async path."""
        return self.config.chunk_by_api

    @requires_dependencies(dependencies=["unstructured"])
    def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
        """Chunk the elements stored in `elements_filepath` locally.

        Args:
            elements_filepath: JSON file holding the elements to chunk.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            The chunked elements as dictionaries. When the file holds no
            elements, or the configured strategy is not one of the locally
            supported ones, the elements are returned unchunked.
        """
        from unstructured.chunking import dispatch
        from unstructured.staging.base import elements_from_json

        elements = elements_from_json(filename=str(elements_filepath))
        if not elements:
            return [e.to_dict() for e in elements]
        local_chunking_strategies = ("basic", "by_title")
        if self.config.chunking_strategy not in local_chunking_strategies:
            logger.warning(
                "chunking strategy not supported for local chunking: "
                f"{self.config.chunking_strategy}, must be one of: "
                f"{', '.join(local_chunking_strategies)}"
            )
            return [e.to_dict() for e in elements]
        chunked_elements = dispatch.chunk(elements=elements, **self.config.to_chunking_kwargs())
        chunked_elements_dicts = [e.to_dict() for e in chunked_elements]
        return assign_and_map_hash_ids(elements=chunked_elements_dicts)

    @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
    async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
        """Chunk the elements stored in `elements_filepath` through the chunking API.

        Args:
            elements_filepath: JSON file holding the elements to chunk.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            The elements returned by the API, after `assign_and_map_hash_ids`.
        """
        # `chunk_api_key` is optional and defaults to None; calling
        # `.get_secret_value()` on None raised AttributeError before any request
        # was made. Forward None instead.
        # NOTE(review): assumes `call_api_async` tolerates `api_key=None` - confirm.
        secret = self.config.chunk_api_key
        api_key = secret.get_secret_value() if secret is not None else None
        elements = await call_api_async(
            server_url=self.config.chunking_endpoint,
            api_key=api_key,
            filename=elements_filepath,
            api_parameters=self.config.to_chunking_kwargs(),
        )

        return assign_and_map_hash_ids(elements=elements)
@@ -0,0 +1,69 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass
3
+ from typing import Optional, Type, TypeVar
4
+
5
+ from unstructured_ingest.v2.interfaces import (
6
+ ConnectionConfig,
7
+ Downloader,
8
+ DownloaderConfig,
9
+ Indexer,
10
+ IndexerConfig,
11
+ Uploader,
12
+ UploaderConfig,
13
+ UploadStager,
14
+ UploadStagerConfig,
15
+ )
16
+
17
# Type variables, each bounded by the interface class of the same name, used to
# annotate the class-valued fields of the registry entries below.
IndexerT = TypeVar("IndexerT", bound=Indexer)
IndexerConfigT = TypeVar("IndexerConfigT", bound=IndexerConfig)
DownloaderT = TypeVar("DownloaderT", bound=Downloader)
DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig)
ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
UploadStagerT = TypeVar("UploadStagerT", bound=UploadStager)
UploaderConfigT = TypeVar("UploaderConfigT", bound=UploaderConfig)
UploaderT = TypeVar("UploaderT", bound=Uploader)
26
+
27
+
28
@dataclass
class RegistryEntry(ABC):
    """Common base class of the source and destination registry entries; declares no fields."""

    pass
31
+
32
+
33
@dataclass
class SourceRegistryEntry(RegistryEntry):
    """Registry record naming the classes that make up a source connector."""

    # Required indexer and downloader classes.
    indexer: Type[IndexerT]
    downloader: Type[DownloaderT]

    # Optional configuration classes; each defaults to None.
    downloader_config: Optional[Type[DownloaderConfigT]] = None
    indexer_config: Optional[Type[IndexerConfigT]] = None
    connection_config: Optional[Type[ConnectionConfigT]] = None
41
+
42
+
43
# Registered source entries, keyed by source type name.
source_registry: dict[str, SourceRegistryEntry] = {}


def add_source_entry(source_type: str, entry: SourceRegistryEntry):
    """Register `entry` under `source_type`.

    Raises:
        ValueError: if `source_type` already has an entry in `source_registry`.
    """
    already_registered = source_type in source_registry
    if not already_registered:
        source_registry[source_type] = entry
        return
    raise ValueError(f"source {source_type} has already been registered")
50
+
51
+
52
@dataclass
class DestinationRegistryEntry(RegistryEntry):
    """Registry record naming the classes that make up a destination connector."""

    # Uploader class; the only required field.
    uploader: Type[UploaderT]
    # Optional upload-stager class.
    upload_stager: Optional[Type[UploadStagerT]] = None

    # Optional configuration classes for the stager and the uploader.
    upload_stager_config: Optional[Type[UploadStagerConfigT]] = None
    uploader_config: Optional[Type[UploaderConfigT]] = None

    # Optional connection configuration class.
    connection_config: Optional[Type[ConnectionConfigT]] = None
61
+
62
+
63
# Registered destination entries, keyed by destination type name.
destination_registry: dict[str, DestinationRegistryEntry] = {}


def add_destination_entry(destination_type: str, entry: DestinationRegistryEntry):
    """Register `entry` under `destination_type`.

    Raises:
        ValueError: if `destination_type` already has an entry in `destination_registry`.
    """
    already_registered = destination_type in destination_registry
    if not already_registered:
        destination_registry[destination_type] = entry
        return
    raise ValueError(f"destination {destination_type} has already been registered")