unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,384 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ import multiprocessing as mp
6
+ import shutil
7
+ from dataclasses import InitVar, dataclass, field
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
12
+ from unstructured_ingest.v2.logger import logger, make_default_logger
13
+ from unstructured_ingest.v2.otel import OtelHandler
14
+ from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
15
+ from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
16
+ from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
17
+ from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
18
+ from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
19
+ from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
20
+ from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
21
+ from unstructured_ingest.v2.pipeline.steps.uncompress import Uncompressor, UncompressStep
22
+ from unstructured_ingest.v2.pipeline.steps.upload import UploadStep
23
+ from unstructured_ingest.v2.processes.chunker import ChunkerConfig
24
+ from unstructured_ingest.v2.processes.connector_registry import (
25
+ ConnectionConfig,
26
+ DownloaderConfigT,
27
+ IndexerConfigT,
28
+ UploaderConfigT,
29
+ UploadStagerConfigT,
30
+ destination_registry,
31
+ source_registry,
32
+ )
33
+ from unstructured_ingest.v2.processes.connectors.local import LocalUploader
34
+ from unstructured_ingest.v2.processes.embedder import EmbedderConfig
35
+ from unstructured_ingest.v2.processes.filter import FiltererConfig
36
+ from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
37
+
38
+
39
class PipelineError(Exception):
    """Raised by the pipeline when prechecks fail or documents end up in a failed state."""
41
+
42
+
43
@dataclass
class Pipeline:
    """Wire the individual ingest processes into steps and run them in order.

    The processes are passed as init-only arguments (``InitVar``) and wrapped
    into their ``*Step`` counterparts in ``__post_init__``. ``run()`` then
    executes: index -> filter -> download -> filter -> (uncompress -> filter)
    -> partition -> (chunk) -> (embed) -> (stage) -> upload, stopping early
    as soon as a step leaves nothing to process.
    """

    context: ProcessorConfig

    indexer: InitVar[IndexerT]
    indexer_step: IndexStep = field(init=False)

    downloader: InitVar[DownloaderT]
    downloader_step: DownloadStep = field(init=False)

    partitioner: InitVar[Partitioner]
    partitioner_step: PartitionStep = field(init=False)

    chunker: InitVar[Chunker | None] = None
    chunker_step: ChunkStep | None = field(init=False, default=None)

    embedder: InitVar[Embedder | None] = None
    embedder_step: EmbedStep | None = field(init=False, default=None)

    stager: InitVar[UploadStager | None] = None
    stager_step: UploadStageStep | None = field(init=False, default=None)

    # NOTE: this default instance is created once, when the class body is
    # executed, and is therefore shared by every Pipeline that does not pass
    # its own uploader.
    uploader: InitVar[Uploader] = field(default=LocalUploader())
    uploader_step: UploadStep | None = field(init=False, default=None)

    uncompress_step: UncompressStep | None = field(init=False, default=None)

    filterer: InitVar[Filterer | None] = None
    filter_step: FilterStep | None = field(init=False, default=None)

    def __post_init__(
        self,
        indexer: IndexerT,
        downloader: DownloaderT,
        partitioner: Partitioner,
        chunker: Chunker | None = None,
        embedder: Embedder | None = None,
        stager: UploadStager | None = None,
        uploader: Uploader | None = None,
        filterer: Filterer | None = None,
    ):
        """Configure logging/tracing and wrap every provided process in its step.

        Optional processes that were not provided leave their step as ``None``.
        Raises ``ValueError`` (via ``check_destination_connector``) when the
        destination expects a stager but none was given.
        """
        make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
        otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint)
        otel_handler.init_trace()
        self.indexer_step = IndexStep(process=indexer, context=self.context)
        self.downloader_step = DownloadStep(process=downloader, context=self.context)
        self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
        self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
        self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None

        self.embedder_step = EmbedStep(process=embedder, context=self.context) if embedder else None
        # TODO: support initialize() call from each step process
        # Potential long call to download embedder models, run before any fanout:
        if embedder and embedder.config:
            embedder.config.get_embedder().initialize()

        self.stager_step = UploadStageStep(process=stager, context=self.context) if stager else None
        self.uploader_step = UploadStep(process=uploader, context=self.context)
        if self.context.uncompress:
            process = Uncompressor()
            self.uncompress_step = UncompressStep(process=process, context=self.context)

        self.check_destination_connector()

    def check_destination_connector(self):
        """Raise ``ValueError`` if the uploader's registry entry needs a stager and none is set."""
        # Make sure that if the set destination connector expects a stager, one is also set
        if not self.uploader_step:
            return
        uploader_connector_type = self.uploader_step.process.connector_type
        registry_entry = destination_registry[uploader_connector_type]
        if registry_entry.upload_stager and self.stager_step is None:
            raise ValueError(
                f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
                f"expects a stager of type {registry_entry.upload_stager.__name__} "
                f"but one was not set"
            )

    def cleanup(self):
        """Remove the work directory when ``context.delete_cache`` is set and it exists."""
        if self.context.delete_cache and Path(self.context.work_dir).exists():
            logger.info(f"deleting cache directory: {self.context.work_dir}")
            shutil.rmtree(self.context.work_dir)

    def log_statuses(self):
        """Log every entry recorded in ``context.status`` at error level."""
        if status := self.context.status:
            logger.error(f"{len(status)} failed documents:")
            for k, v in status.items():
                for kk, vv in v.items():
                    logger.error(f"{k}: [{kk}] {vv}")

    def run(self):
        """Run prechecks and the pipeline inside a tracing span.

        Statuses are always logged and the cache cleaned up afterwards.

        Raises:
            PipelineError: when prechecks fail, or when ``context.status`` is
                non-empty once the run has finished.
        """
        otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.info)
        try:
            with otel_handler.get_tracer().start_as_current_span(
                "ingest process", record_exception=True
            ):
                self._run_prechecks()
                self._run()
        finally:
            self.log_statuses()
            self.cleanup()
            # NOTE: raising inside ``finally`` replaces any exception that is
            # already propagating from the ``try`` block; the original one
            # remains reachable through ``__context__``.
            if self.context.status:
                raise PipelineError("Pipeline did not run successfully")

    def clean_results(self, results: list[Any | list[Any]] | None) -> list[Any] | None:
        """Drop falsy entries and flatten one level of nesting.

        Returns ``None`` instead of an empty list so callers can use a single
        truthiness check.
        """
        if not results:
            return None
        results = [r for r in results if r]
        flat = []
        for r in results:
            if isinstance(r, list):
                flat.extend(r)
            else:
                flat.append(r)
        final = [f for f in flat if f]
        return final or None

    def _run_prechecks(self):
        """Call ``precheck()`` on every configured step's process.

        All failures are collected and logged before a single
        ``PipelineError`` is raised, so one run reports every broken step.
        """
        steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
        if self.chunker_step:
            steps.append(self.chunker_step)
        if self.embedder_step:
            steps.append(self.embedder_step)
        if self.uncompress_step:
            steps.append(self.uncompress_step)
        if self.stager_step:
            steps.append(self.stager_step)
        failures = {}
        for step in steps:
            try:
                step.process.precheck()
            except Exception as e:
                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
        if failures:
            for k, v in failures.items():
                logger.error(f"Step precheck failure: {k}: {v}")
            raise PipelineError("Precheck failed")

    def apply_filter(self, records: list[dict]) -> list[dict]:
        """Return the records whose ``file_data_path`` survives the filter step.

        ``records`` is returned untouched when no filter step is configured or
        when there is nothing to filter (empty or ``None``, which is what
        ``clean_results`` hands back for an empty result).
        """
        if not self.filter_step or not records:
            return records
        data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
        filtered_data = self.filter_step(data_to_filter)
        # ``None`` entries in the step output are skipped; a set keeps the
        # membership test below constant-time per record.
        kept_paths = {f["file_data_path"] for f in filtered_data if f is not None}
        return [r for r in records if r["file_data_path"] in kept_paths]

    def get_indices(self) -> list[dict]:
        """Run the indexer (sync or async) and wrap each result as a step input dict."""
        if self.indexer_step.process.is_async():
            indices = asyncio.run(self.indexer_step.run_async())
        else:
            indices = self.indexer_step.run()
        indices_inputs = [{"file_data_path": i} for i in indices]
        return indices_inputs

    def _run(self):
        """Execute every step in order, returning early once nothing is left to process."""
        logger.info(
            f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
        )
        # A manager-backed dict is used for the status map when
        # multiprocessing is supported, a plain dict otherwise.
        if self.context.mp_supported:
            manager = mp.Manager()
            self.context.status = manager.dict()
        else:
            self.context.status = {}

        # Index into data source
        indices_inputs = self.get_indices()
        if not indices_inputs:
            logger.info("No files to process after indexer, exiting")
            return

        # Initial filtering on indexed content
        indices_inputs = self.apply_filter(records=indices_inputs)
        if not indices_inputs:
            logger.info("No files to process after filtering indexed content, exiting")
            return

        # Download associated content to local file system
        downloaded_data = self.downloader_step(indices_inputs)
        downloaded_data = self.clean_results(results=downloaded_data)
        if not downloaded_data:
            logger.info("No files to process after downloader, exiting")
            return

        # Post download filtering
        downloaded_data = self.apply_filter(records=downloaded_data)
        if not downloaded_data:
            logger.info("No files to process after filtering downloaded content, exiting")
            return

        # Run uncompress if available
        if self.uncompress_step:
            downloaded_data = self.uncompress_step(downloaded_data)
            # Flatten list of lists
            downloaded_data = self.clean_results(results=downloaded_data)

            # Post uncompress filtering
            downloaded_data = self.apply_filter(records=downloaded_data)
            if not downloaded_data:
                logger.info("No files to process after filtering uncompressed content, exiting")
                return

        if not downloaded_data or self.context.download_only:
            return

        # Partition content
        elements = self.partitioner_step(downloaded_data)
        # Download data no longer needed, delete if possible
        self.downloader_step.delete_cache()
        elements = self.clean_results(results=elements)
        if not elements:
            logger.info("No files to process after partitioning, exiting")
            return

        # Run element specific modifiers
        last_step = self.partitioner_step
        for step in [s for s in [self.chunker_step, self.embedder_step, self.stager_step] if s]:
            elements = step(elements)
            elements = self.clean_results(results=elements)
            # Delete data from previous step if possible since no longer needed
            last_step.delete_cache()
            last_step = step
            if not elements:
                logger.info(f"no files to process after {step.__class__.__name__}, exiting")
                return

        # Upload the final result
        self.uploader_step(iterable=elements)
        last_step.delete_cache()

    def __str__(self):
        """Describe the configured steps, in execution order, joined by ``" -> "``."""
        filter_step = self.filter_step
        s = [str(self.indexer_step)]
        if filter_step:
            s.append(str(filter_step))
        s.append(str(self.downloader_step))
        if filter_step:
            s.append(str(filter_step))
        if uncompress_step := self.uncompress_step:
            s.append(str(uncompress_step))
            # The post-uncompress filter only exists when a filter is
            # configured; never render a missing step as "None".
            if filter_step:
                s.append(str(filter_step))
        s.append(str(self.partitioner_step))
        if chunker_step := self.chunker_step:
            s.append(str(chunker_step))
        if embedder_step := self.embedder_step:
            s.append(str(embedder_step))
        if stager_step := self.stager_step:
            s.append(str(stager_step))
        s.append(str(self.uploader_step))
        return " -> ".join(s)

    @classmethod
    def from_configs(
        cls,
        context: ProcessorConfig,
        indexer_config: IndexerConfigT,
        downloader_config: DownloaderConfigT,
        source_connection_config: ConnectionConfig,
        partitioner_config: PartitionerConfig,
        filterer_config: FiltererConfig | None = None,
        chunker_config: ChunkerConfig | None = None,
        embedder_config: EmbedderConfig | None = None,
        destination_connection_config: ConnectionConfig | None = None,
        stager_config: UploadStagerConfigT | None = None,
        uploader_config: UploaderConfigT | None = None,
    ) -> "Pipeline":
        """Build a pipeline from config objects by looking them up in the registries.

        The source entry is the one whose indexer, downloader and connection
        config types all match the provided instances. The destination entry
        is matched on the uploader config and, when given, the destination
        connection and stager configs. When no ``uploader_config`` is given
        the pipeline is built with the default uploader.

        Raises:
            ValueError: when zero or more than one registry entry matches on
                either the source or the destination side.
        """
        # Get registry key based on indexer config
        source_entry = {
            k: v
            for k, v in source_registry.items()
            if isinstance(indexer_config, v.indexer_config)
            and isinstance(downloader_config, v.downloader_config)
            and isinstance(source_connection_config, v.connection_config)
        }
        if len(source_entry) > 1:
            raise ValueError(
                f"multiple entries found matching provided indexer, "
                f"downloader and connection configs: {source_entry}"
            )
        if not source_entry:
            raise ValueError(
                "no entry found in source registry with matching indexer, "
                "downloader and connection configs"
            )
        source = list(source_entry.values())[0]
        pipeline_kwargs = {
            "context": context,
            "indexer": source.indexer(
                index_config=indexer_config, connection_config=source_connection_config
            ),
            "downloader": source.downloader(
                download_config=downloader_config, connection_config=source_connection_config
            ),
            "partitioner": Partitioner(config=partitioner_config),
        }
        if filterer_config:
            pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
        if chunker_config:
            pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
        if embedder_config:
            pipeline_kwargs["embedder"] = Embedder(config=embedder_config)
        if not uploader_config:
            # Use ``cls`` (not ``Pipeline``) so subclasses get their own type
            # back on this path too.
            return cls(**pipeline_kwargs)

        destination_entry = {
            k: v
            for k, v in destination_registry.items()
            if isinstance(uploader_config, v.uploader_config)
        }
        if destination_connection_config:
            destination_entry = {
                k: v
                for k, v in destination_entry.items()
                if isinstance(destination_connection_config, v.connection_config)
            }
        if stager_config:
            destination_entry = {
                k: v
                for k, v in destination_entry.items()
                if isinstance(stager_config, v.upload_stager_config)
            }

        if len(destination_entry) > 1:
            raise ValueError(
                f"multiple entries found matching provided uploader, "
                f"stager and connection configs: {destination_entry}"
            )
        if not destination_entry:
            raise ValueError(
                "no entry found in destination registry with matching uploader, "
                "stager and connection configs"
            )

        destination = list(destination_entry.values())[0]
        if stager_config:
            pipeline_kwargs["stager"] = destination.upload_stager(
                upload_stager_config=stager_config
            )
        # ``uploader_config`` is known to be truthy here (see early return above).
        uploader_kwargs = {"upload_config": uploader_config}
        if destination_connection_config:
            uploader_kwargs["connection_config"] = destination_connection_config
        pipeline_kwargs["uploader"] = destination.uploader(**uploader_kwargs)
        return cls(**pipeline_kwargs)
File without changes
@@ -0,0 +1,80 @@
1
+ import asyncio
2
+ import hashlib
3
+ import json
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Callable, Optional, TypedDict
7
+
8
+ from unstructured_ingest.v2.interfaces import FileData
9
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
10
+ from unstructured_ingest.v2.logger import logger
11
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
12
+ from unstructured_ingest.v2.processes.chunker import Chunker
13
+ from unstructured_ingest.v2.utils import serialize_base_model_json
14
+
15
# Default value for the `identifier` field of ChunkStep; it appears in the
# step's string form and in its creation log message.
STEP_ID = "chunk"
16
+
17
+
18
class ChunkStepResponse(TypedDict):
    """Result record produced by the chunk step for a single input file."""

    # Path of the file-data JSON the chunked output belongs to.
    file_data_path: str
    # Path of the JSON file holding the chunked content.
    path: str
21
+
22
+
23
@dataclass
class ChunkStep(PipelineStep):
    """Pipeline step that runs the chunker over a partitioned elements file.

    Output is written as JSON under the step's cache directory, using a file
    name derived from the chunker config and the input file name, so an
    existing output can be reused instead of chunking again.
    """

    process: Chunker
    identifier: str = STEP_ID

    def __str__(self):
        strategy = self.process.config.chunking_strategy
        return f"{self.identifier} ({strategy})"

    def __post_init__(self):
        # Log the serialized config (or None when the process has no config).
        config = self.process.config.model_dump_json() if self.process.config else None
        logger.info(f"created {self.identifier} with configs: {config}")

    def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
        """Return True when reprocessing is requested or no output exists at ``filepath``."""
        reprocess_requested = self.context.reprocess or file_data.reprocess
        return True if reprocess_requested else not filepath.exists()

    def get_output_filepath(self, filename: Path) -> Path:
        """Build the resolved output path for ``filename`` and ensure its parent dir exists."""
        digest = self.get_hash(extras=[filename.name])
        target = (self.cache_dir / f"{digest}.json").resolve()
        target.parent.mkdir(parents=True, exist_ok=True)
        return target

    def _save_output(self, output_filepath: str, chunked_content: list[dict]):
        """Serialize ``chunked_content`` as indented JSON into ``output_filepath``."""
        with open(str(output_filepath), "w") as out_file:
            logger.debug(f"writing chunker output to: {output_filepath}")
            json.dump(chunked_content, out_file, indent=2)

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str, **kwargs
    ) -> ChunkStepResponse:
        """Chunk the elements file at ``path`` with ``fn`` unless a usable output exists.

        ``fn`` may be a plain function or a coroutine function; coroutine
        functions are awaited, under ``context.semaphore`` when one is set.
        """
        elements_path = Path(path)
        file_data = file_data_from_file(path=file_data_path)
        output_filepath = self.get_output_filepath(filename=elements_path)
        response = ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))

        needs_chunking = self.should_chunk(filepath=output_filepath, file_data=file_data)
        if not needs_chunking:
            logger.debug(f"skipping chunking, output already exists: {output_filepath}")
            return response

        call_kwargs = {"elements_filepath": elements_path}
        if asyncio.iscoroutinefunction(fn):
            semaphore = self.context.semaphore
            if semaphore:
                async with semaphore:
                    chunked = await fn(**call_kwargs)
            else:
                chunked = await fn(**call_kwargs)
        else:
            chunked = fn(**call_kwargs)

        self._save_output(output_filepath=str(output_filepath), chunked_content=chunked)
        return response

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Return a 12-character SHA-256 prefix of the serialized config plus ``extras``."""
        serialized = serialize_base_model_json(
            model=self.process.config, sort_keys=True, ensure_ascii=True
        )
        suffix = "".join(extras) if extras else ""
        digest = hashlib.sha256((serialized + suffix).encode())
        return digest.hexdigest()[:12]
@@ -0,0 +1,207 @@
1
+ import asyncio
2
+ import hashlib
3
+ import json
4
+ import shutil
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Callable, Optional, TypedDict, TypeVar
8
+
9
+ from unstructured_ingest.v2.interfaces import FileData, download_responses
10
+ from unstructured_ingest.v2.interfaces.downloader import Downloader
11
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
12
+ from unstructured_ingest.v2.logger import logger
13
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
14
+ from unstructured_ingest.v2.utils import serialize_base_model_json
15
+
16
# Type variable bounded by Downloader; used to annotate DownloadStep.process.
DownloaderT = TypeVar("DownloaderT", bound=Downloader)

# Default value for the `identifier` field of DownloadStep; it appears in the
# step's string form and in its log messages.
STEP_ID = "download"
19
+
20
+
21
class DownloadStepResponse(TypedDict):
    """Result record produced by the download step for a single downloaded file."""

    # Path of the file-data JSON describing the downloaded file.
    file_data_path: str
    # Local path the content was downloaded to.
    path: str
24
+
25
+
26
@dataclass
class DownloadStep(PipelineStep):
    """Pipeline step that downloads indexed content to the local file system.

    For each file-data record the step either reuses an existing local copy
    or invokes the downloader, then writes the updated file data (local path,
    file size) back to disk and returns one response per downloaded file.
    """

    process: DownloaderT
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.__class__.__name__})"

    def __post_init__(self):
        # Log the serialized download and connection configs (None when unset).
        config = (
            self.process.download_config.model_dump_json() if self.process.download_config else None
        )
        connection_config = (
            self.process.connection_config.model_dump_json()
            if self.process.connection_config
            else None
        )
        logger.info(
            f"Created {self.identifier} with configs: {config}, "
            f"connection configs: {connection_config}"
        )

    @staticmethod
    def is_float(value: str):
        """Return True when ``float(value)`` succeeds, False otherwise."""
        try:
            float(value)
        except (TypeError, ValueError):
            # TypeError covers values float() cannot accept at all (e.g. None
            # or a container), so callers get False rather than an exception.
            return False
        return True

    def should_download(self, file_data: FileData, file_data_path: str) -> bool:
        """Decide whether the file described by ``file_data`` must be downloaded.

        Returns True when re-download is forced, when there is no local copy,
        or when the modification-time comparison below triggers. In that last
        case the file data is also flagged for reprocessing and persisted.
        """
        if self.context.re_download:
            return True
        download_path = self.process.get_download_path(file_data=file_data)
        if not download_path or not download_path.exists():
            return True
        # NOTE(review): this re-downloads when the local file's mtime is *newer*
        # than the source's date_modified; confirm the comparison direction is
        # the intended one.
        if (
            download_path.is_file()
            and file_data.metadata.date_modified
            and self.is_float(file_data.metadata.date_modified)
            and download_path.stat().st_mtime > float(file_data.metadata.date_modified)
        ):
            # Also update file data to mark this to reprocess since this won't change the filename
            file_data.reprocess = True
            file_data.to_file(path=file_data_path)
            return True
        return False

    def update_file_data(
        self, file_data: FileData, file_data_path: Path, download_path: Path
    ) -> None:
        """Record the local download path and actual file size, then rewrite the file data.

        The size measured on disk wins over the value in the metadata; a
        warning is logged when the two disagree.
        """
        file_data.local_download_path = str(download_path.resolve())
        file_size_bytes = download_path.stat().st_size
        if not file_data.metadata.filesize_bytes and file_size_bytes:
            file_data.metadata.filesize_bytes = file_size_bytes
        if (
            file_data.metadata.filesize_bytes
            and file_data.metadata.filesize_bytes != file_size_bytes
        ):
            logger.warning(
                f"file size in original file data "
                f"({file_data.metadata.filesize_bytes}) doesn't "
                f"match size of local file: {file_size_bytes}, updating"
            )
            file_data.metadata.filesize_bytes = file_size_bytes
        logger.debug(f"updating file data with new content: {file_data.model_dump()}")
        with file_data_path.open("w") as file:
            json.dump(file_data.model_dump(), file, indent=2)

    async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
        """Download the file described at ``file_data_path`` unless a local copy is usable.

        ``fn`` may be a plain function or a coroutine function; coroutine
        functions are awaited, under ``context.semaphore`` when one is set.
        """
        file_data = file_data_from_file(path=file_data_path)
        download_path = self.process.get_download_path(file_data=file_data)
        if not self.should_download(file_data=file_data, file_data_path=file_data_path):
            logger.debug(f"skipping download, file already exists locally: {download_path}")
            self.update_file_data(
                file_data=file_data,
                file_data_path=Path(file_data_path),
                download_path=download_path,
            )
            return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
        fn_kwargs = {"file_data": file_data}
        if not asyncio.iscoroutinefunction(fn):
            download_results = fn(**fn_kwargs)
        elif semaphore := self.context.semaphore:
            async with semaphore:
                download_results = await fn(**fn_kwargs)
        else:
            download_results = await fn(**fn_kwargs)
        return self.create_step_results(
            current_file_data_path=file_data_path,
            download_results=download_results,
            current_file_data=file_data,
        )

    def create_step_results(
        self,
        current_file_data_path: str,
        current_file_data: FileData,
        download_results: download_responses,
    ) -> list[DownloadStepResponse]:
        """Turn the downloader's output into step responses.

        ``download_results`` is either a single result or a list of results;
        each result is indexed by ``"file_data"`` and ``"path"``. File data
        that differs from the current record is persisted to a new file in the
        cache directory. Every response carries ``path`` as a string.
        """
        responses = []
        if not isinstance(download_results, list):
            file_data = current_file_data
            file_data_path = current_file_data_path
            download_path = download_results["path"]
            if download_results["file_data"].identifier == current_file_data.identifier:
                self.update_file_data(
                    file_data=file_data,
                    file_data_path=Path(file_data_path),
                    download_path=download_path,
                )
                responses = [
                    DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
                ]
            else:
                file_data = download_results["file_data"]
                file_data_path = self.persist_new_file_data(file_data=file_data)
                self.update_file_data(
                    file_data=file_data,
                    file_data_path=Path(file_data_path),
                    download_path=download_path,
                )
                # NOTE(review): the response points at the *original* file data
                # path although the new file data was just persisted at
                # ``file_data_path`` -- confirm this is intended.
                responses = [
                    DownloadStepResponse(
                        file_data_path=current_file_data_path, path=str(download_path)
                    )
                ]
        else:
            # Supplemental results generated as part of the download process
            for res in download_results:
                file_data = res["file_data"]
                file_data_path = self.persist_new_file_data(file_data=file_data)
                download_path = res["path"]
                self.update_file_data(
                    file_data=file_data,
                    file_data_path=Path(file_data_path),
                    download_path=download_path,
                )
                # Stringify the path like the single-result branches do, so the
                # response honours the ``path: str`` contract.
                responses.append(
                    DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
                )

        return responses

    def persist_new_file_data(self, file_data: FileData) -> str:
        """Write ``file_data`` as JSON into the cache directory and return the file's path."""
        record_hash = self.get_hash(extras=[file_data.identifier])
        filename = f"{record_hash}.json"
        filepath = (self.cache_dir / filename).resolve()
        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(str(filepath), "w") as f:
            json.dump(file_data.model_dump(), f, indent=2)
        return str(filepath)

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Return a 12-character SHA-256 prefix of both configs plus ``extras``."""
        download_config_dict = json.loads(
            serialize_base_model_json(model=self.process.download_config)
        )
        connection_config_dict = json.loads(
            serialize_base_model_json(model=self.process.connection_config)
        )
        hashable_dict = {
            "download_config": download_config_dict,
            "connection_config": connection_config_dict,
        }
        # sort_keys makes the serialized form independent of key order.
        hashable_string = json.dumps(hashable_dict, sort_keys=True)
        if extras:
            hashable_string += "".join(extras)
        return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]

    @property
    def cache_dir(self) -> Path:
        """Directory downloads are written to, taken from the download config."""
        return self.process.download_config.download_dir

    def delete_cache(self):
        """Remove the download directory when iterative deletion is on and downloads are not preserved."""
        if (
            self.context.iter_delete
            and not self.context.preserve_downloads
            and self.cache_dir.exists()
        ):
            cache_dir = self.cache_dir
            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
            shutil.rmtree(cache_dir)