unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,130 @@
1
+ import ast
2
+ import json
3
+ import logging
4
+ import typing as t
5
+
6
+ logger = logging.getLogger("unstructured_ingest")
7
+
8
+
9
def default_is_data_sensitive(k: str, v: t.Any) -> bool:
    """
    Default predicate deciding whether the value stored under key ``k`` must be redacted.

    Returns True when the lower-cased key is exactly one of the always-sensitive field
    names (whatever the value is), or when the value is truthy and the lower-cased key
    contains one of the trigger substrings.
    """
    sensitive_fields = [
        "account_name",
        "client_id",
    ]
    sensitive_triggers = ["key", "cred", "token", "password", "oauth", "secret"]
    lowered = k.lower()
    triggered = bool(v and any(s in lowered for s in sensitive_triggers))
    return triggered or lowered in sensitive_fields


def hide_sensitive_fields(
    data: dict, is_sensitive_fn: t.Callable[[str, t.Any], bool] = default_is_data_sensitive
) -> dict:
    """
    Will recursively look through every k, v pair in this dict and any nested ones and run
    is_sensitive_fn to dynamically redact the value of the k, v pair. Will also check if
    any string value can be parsed as valid json and process that dict as well and replace
    the original string with the json.dumps() version of the redacted dict.

    The same is_sensitive_fn is applied at every nesting level, including dicts decoded
    from json strings. The input dict is not modified; a shallow copy with the redacted
    values is returned.
    """
    new_data = data.copy()
    for k, v in new_data.items():
        if is_sensitive_fn(k, v):
            new_data[k] = "*******"
        # NOTE(review): a dict (or json-encoded dict) stored under a sensitive key replaces
        # the "*******" placeholder with its recursively redacted form -- confirm this is
        # the intended behaviour rather than redacting the whole value.
        if isinstance(v, dict):
            # Pass the predicate down so a custom is_sensitive_fn also governs nested dicts
            new_data[k] = hide_sensitive_fields(v, is_sensitive_fn)
        elif isinstance(v, str):
            # Need to take into account strings generated via json.dumps()
            try:
                json_data = json.loads(v)
            except json.JSONDecodeError:
                continue
            if isinstance(json_data, dict):
                updated_data = hide_sensitive_fields(json_data, is_sensitive_fn)
                new_data[k] = json.dumps(updated_data)

    return new_data
48
+
49
+
50
def redact_jsons(s: str) -> str:
    """
    Takes in a generic string and pulls out all brace-delimited spans that parse either as
    json or as a python dict literal. Leverages hide_sensitive_fields() to redact any
    sensitive information and replaces the original span with the redacted json. There can
    be any number of such spans in a generic string.

    Spans that cannot be processed are left untouched instead of raising: a '{' without a
    matching '}' is skipped (scanning resumes right after it, so '{ text, {"a": 3}' still
    has its inner json handled), and a balanced span that is neither json nor a
    json-serialisable python literal is ignored.
    """
    if "{" not in s:
        return s

    length = len(s)
    jsons = []
    i = 0
    while i < length:
        if s[i] != "{":
            i += 1
            continue
        # Walk forward until the brace opened at position i is closed
        depth = 1
        end = i + 1
        while end < length and depth:
            if s[end] == "{":
                depth += 1
            elif s[end] == "}":
                depth -= 1
            end += 1
        if depth:
            # Ran off the end of the string: this '{' is unmatched, try the next character
            i += 1
            continue
        jsons.append(s[i:end])
        i = end

    for j in jsons:
        try:
            parsed = json.loads(j)
        except json.JSONDecodeError:
            # Not json, may be the printed form of a python dict (single quotes, None, ...)
            try:
                parsed = json.loads(json.dumps(ast.literal_eval(j)))
            except (ValueError, SyntaxError, TypeError, RecursionError):
                continue
        if not isinstance(parsed, dict):
            continue
        hidden_j = json.dumps(hide_sensitive_fields(parsed))
        s = s.replace(j, hidden_j)
    return s
89
+
90
+
91
class SensitiveFormatter(logging.Formatter):
    """Log formatter that passes every rendered log line through redact_jsons()."""

    def format(self, record):
        # Render with the stock formatter first, then scrub the finished text
        return redact_jsons(super().format(record=record))
95
+
96
+
97
def remove_root_handlers(logger: logging.Logger) -> None:
    """Detach every handler currently attached to the root logger."""
    # NOTE(robinson): in some environments such as Google Colab, there is a root handler
    # that does not mask secrets, meaning sensitive info such as api keys appear in logs.
    # Removing these when they exist prevents this behavior
    root = logger.root
    # Iterate over a snapshot: removeHandler() mutates root.handlers, and removing from the
    # list being iterated would skip every other handler.
    for handler in list(root.handlers):
        root.removeHandler(handler)
104
+
105
+
106
def ingest_log_streaming_init(level: int) -> None:
    """
    Configure the module-level ingest logger: attach the redacting stream handler if it is
    not attached yet, drop any root handlers and set the logger to ``level``.
    """
    handler_name = "ingest_log_handler"
    stream_handler = logging.StreamHandler()
    stream_handler.name = handler_name
    stream_handler.setFormatter(
        SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    )

    # Only want to add the handler once
    already_attached = any(h.name == handler_name for h in logger.handlers)
    if not already_attached:
        logger.addHandler(stream_handler)

    remove_root_handlers(logger)
    logger.setLevel(level)
118
+
119
+
120
def make_default_logger(level: int) -> logging.Logger:
    """
    Return the "unstructured_ingest" logger set to ``level`` with the redacting stream
    handler attached and any root handlers removed.

    The handler is only attached when no handler named "ingest_log_handler" is present,
    so calling this more than once does not produce duplicated log lines.
    """
    logger = logging.getLogger("unstructured_ingest")
    if all(h.name != "ingest_log_handler" for h in logger.handlers):
        handler = logging.StreamHandler()
        handler.name = "ingest_log_handler"
        formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    logger.setLevel(level)
    remove_root_handlers(logger)
    return logger
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env python3
2
+ from unstructured_ingest.cli.cli import get_cmd
3
+
4
+
5
def main():
    """Build the ingest command via get_cmd() and invoke it."""
    get_cmd()()


if __name__ == "__main__":
    main()
@@ -0,0 +1,22 @@
1
+ from .doc_factory import DocFactory
2
+ from .interfaces import PipelineContext, ReformatNode
3
+ from .partition import Partitioner
4
+ from .permissions import PermissionsDataCleaner
5
+ from .pipeline import Pipeline
6
+ from .reformat.chunking import Chunker
7
+ from .reformat.embedding import Embedder
8
+ from .source import Reader
9
+ from .write import Writer
10
+
11
+ __all__ = [
12
+ "DocFactory",
13
+ "Partitioner",
14
+ "Reader",
15
+ "Embedder",
16
+ "PipelineContext",
17
+ "Pipeline",
18
+ "Writer",
19
+ "Chunker",
20
+ "ReformatNode",
21
+ "PermissionsDataCleaner",
22
+ ]
@@ -0,0 +1,19 @@
1
+ import os
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.pipeline.interfaces import CopyNode
8
+
9
+
10
class Copier(CopyNode):
    """Copy node that places a pipeline json file at its ingest doc's output filename."""

    def run(self, json_path: str):
        """
        Copy ``json_path`` to the ``_output_filename`` of the ingest doc registered in the
        pipeline context under the file's name (without extension), creating the parent
        directory of the destination when needed.
        """
        doc_hash, _ = os.path.splitext(os.path.basename(json_path))
        doc = create_ingest_doc_from_dict(self.pipeline_context.ingest_docs_map[doc_hash])
        destination = doc._output_filename
        Path(destination).parent.mkdir(parents=True, exist_ok=True)
        logger.info(f"copying {json_path} -> {destination}")
        shutil.copy(json_path, destination)
@@ -0,0 +1,12 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.pipeline.interfaces import DocFactoryNode
5
+
6
+
7
@dataclass
class DocFactory(DocFactoryNode):
    """Doc factory node that asks the source connector for its ingest docs."""

    def run(self, *args, **kwargs) -> t.Iterable[dict]:
        """Return the dict form of every ingest doc produced by the source connector."""
        return [doc.to_dict() for doc in self.source_doc_connector.get_ingest_docs()]
@@ -0,0 +1,270 @@
1
+ import hashlib
2
+ import json
3
+ import logging
4
+ import multiprocessing as mp
5
+ import typing as t
6
+ from abc import ABC, abstractmethod
7
+ from dataclasses import dataclass, field
8
+ from multiprocessing.managers import DictProxy
9
+ from pathlib import Path
10
+
11
+ from dataclasses_json import DataClassJsonMixin
12
+
13
+ from unstructured_ingest.error import SourceConnectionNetworkError
14
+ from unstructured_ingest.interfaces import (
15
+ BaseDestinationConnector,
16
+ BaseSourceConnector,
17
+ PartitionConfig,
18
+ ProcessorConfig,
19
+ ReadConfig,
20
+ RetryStrategyConfig,
21
+ )
22
+ from unstructured_ingest.logger import ingest_log_streaming_init, logger
23
+
24
+ if t.TYPE_CHECKING:
25
+ from unstructured_ingest.ingest_backoff import RetryHandler
26
+
27
+
28
@dataclass
class PipelineContext(ProcessorConfig):
    """
    Data that gets shared across each pipeline node
    """

    def __post_init__(self):
        # Starts unset; reading the property before it is assigned raises
        self._ingest_docs_map: t.Optional[DictProxy] = None

    @property
    def ingest_docs_map(self) -> DictProxy:
        """The shared ingest docs mapping; raises ValueError if it was never assigned."""
        docs_map = self._ingest_docs_map
        if docs_map is not None:
            return docs_map
        raise ValueError("ingest_docs_map never initialized")

    @ingest_docs_map.setter
    def ingest_docs_map(self, value: DictProxy):
        self._ingest_docs_map = value
46
+
47
+
48
@dataclass
class PipelineNode(DataClassJsonMixin, ABC):
    """
    Class that encapsulates logic to run during a single pipeline step
    """

    pipeline_context: PipelineContext

    def __call__(self, iterable: t.Optional[t.Iterable[t.Any]] = None) -> t.Any:
        """
        Initialize the node, execute run() and return the result (also kept on self.result).

        Nodes without multiprocessing support get the whole iterable in a single run() call.
        Otherwise run() is called once per item, in-process when num_processes == 1 or
        through a multiprocessing pool. With no items and no pool, run() is called without
        arguments. None entries are dropped from iterable results.
        """
        docs = iterable or []
        if docs:
            logger.info(
                f"calling {self.__class__.__name__} with {len(docs)} docs",  # type: ignore
            )

        self.initialize()
        if not self.supported_multiprocessing():
            self.result = self.run(docs) if docs else self.run()
        elif self.pipeline_context.num_processes == 1:
            self.result = [self.run(doc) for doc in docs] if docs else self.run()
        else:
            log_level = logging.DEBUG if self.pipeline_context.verbose else logging.INFO
            with mp.Pool(
                processes=self.pipeline_context.num_processes,
                initializer=ingest_log_streaming_init,
                initargs=(log_level,),
            ) as pool:
                self.result = pool.map(self.run, docs)
        # Remove None which may be caused by failed docs that didn't raise an error
        if isinstance(self.result, t.Iterable):
            self.result = [entry for entry in self.result if entry is not None]
        return self.result

    def supported_multiprocessing(self) -> bool:
        """Whether run() may be fanned out per item; subclasses override to opt out."""
        return True

    @abstractmethod
    def run(self, *args, **kwargs) -> t.Optional[t.Any]:
        """Execute the node's work; implemented by subclasses."""

    def initialize(self):
        """Create the node's directory (when get_path() gives one) and set up log streaming."""
        path = self.get_path()
        if path:
            logger.info(f"creating {path}")
            path.mkdir(parents=True, exist_ok=True)
        log_level = logging.DEBUG if self.pipeline_context.verbose else logging.INFO
        ingest_log_streaming_init(log_level)

    def get_path(self) -> t.Optional[Path]:
        """Directory this node needs created in initialize(); None when it needs none."""
        return None
101
+
102
+
103
@dataclass
class DocFactoryNode(PipelineNode):
    """
    Encapsulated logic to generate a list of ingest docs
    """

    source_doc_connector: BaseSourceConnector

    def initialize(self):
        """Log the source connector, run the base initialization, then initialize the connector."""
        connector_json = self.source_doc_connector.to_json()
        logger.info(
            f"Running doc factory to generate ingest docs. Source connector: {connector_json}",
        )
        super().initialize()
        self.source_doc_connector.initialize()

    @abstractmethod
    def run(self, *args, **kwargs) -> t.Iterable[dict]:
        """Produce the ingest docs as dicts; implemented by subclasses."""

    def supported_multiprocessing(self) -> bool:
        # run() is invoked a single time rather than once per item
        return False
125
+
126
+
127
@dataclass
class SourceNode(PipelineNode):
    """A pipeline node representing logic to pull data from a source using base ingest documents.

    This class encapsulates the logic for pulling data from a specified source using base ingest
    documents. The output of this logic is expected to be in JSON format representing the data
    itself.

    Attributes:
        read_config: A configuration object specifying how to read data from the source.
        retry_strategy_config: Optional configuration specifying the strategy for network errors.

    Properties:
        retry_strategy: A retry handler configured based on the retry strategy configuration.

    Methods:
        initialize: Initializes the source node and logs the process.
        run: Abstract method for downloading data associated with ingest documents.
    """

    read_config: ReadConfig
    retry_strategy_config: t.Optional[RetryStrategyConfig] = None

    @property
    def retry_strategy(self) -> t.Optional["RetryHandler"]:
        """Build a RetryHandler from retry_strategy_config, or return None when it is unset."""
        config = self.retry_strategy_config
        if not config:
            return None

        # Imported lazily so these are only loaded when a retry strategy is configured
        import backoff

        from unstructured_ingest.ingest_backoff import RetryHandler

        return RetryHandler(
            backoff.expo,
            SourceConnectionNetworkError,
            max_time=config.max_retry_time,
            max_tries=config.max_retries,
            logger=logger,
            start_log_level=logger.level,
            backoff_log_level=logger.level,
        )

    def initialize(self):
        """Log the start of the source step and run the base initialization."""
        logger.info("Running source node to download data associated with ingest docs")
        super().initialize()

    @abstractmethod
    def run(self, ingest_doc_json: str) -> t.Optional[str]:
        """Download the data for one ingest doc; implemented by subclasses."""
175
+
176
+
177
@dataclass
class PartitionNode(PipelineNode):
    """
    Encapsulates logic to run partition on the json files as the output of the source node
    """

    partition_config: PartitionConfig
    partition_kwargs: dict = field(default_factory=dict)

    def initialize(self):
        """Log the partition configuration and run the base initialization."""
        config_json = self.partition_config.to_json()
        kwargs_json = json.dumps(self.partition_kwargs)
        logger.info(
            "Running partition node to extract content from json files. "
            f"Config: {config_json}, "
            f"partition kwargs: {kwargs_json}]",
        )
        super().initialize()

    def create_hash(self) -> str:
        """Return the first 32 hex chars of the sha256 of the partition config and kwargs."""
        payload = self.partition_config.to_dict()
        payload["partition_kwargs"] = self.partition_kwargs
        # sort_keys keeps the serialized form independent of key order
        serialized = json.dumps(payload, sort_keys=True)
        return hashlib.sha256(serialized.encode()).hexdigest()[:32]

    @abstractmethod
    def run(self, json_path: str) -> t.Optional[str]:
        """Partition one document; implemented by subclasses."""

    def get_path(self) -> Path:
        """Resolved "partitioned" directory under the pipeline's work_dir."""
        work_dir = Path(self.pipeline_context.work_dir)
        return (work_dir / "partitioned").resolve()
205
+
206
+
207
@dataclass
class ReformatNode(PipelineNode, ABC):
    """
    Encapsulates any logic to reformat the output List[Element]
    content from partition before writing it
    """

    @abstractmethod
    def run(self, elements_json: str) -> t.Optional[str]:
        """Reformat one elements json; implemented by subclasses."""
217
+
218
+
219
@dataclass
class WriteNode(PipelineNode):
    """
    Encapsulated logic to write the final result to a downstream data connection
    """

    dest_doc_connector: BaseDestinationConnector

    @abstractmethod
    def run(self, json_paths: t.List[str]):
        """Write the given json files to the destination; implemented by subclasses."""

    def initialize(self):
        """Log the redacted destination connector, run the base init, then init the connector."""
        connector_json = self.dest_doc_connector.to_json(redact_sensitive=True)
        logger.info(
            f"Running write node to upload content. Destination connector: {connector_json}]",
        )
        super().initialize()
        self.dest_doc_connector.initialize()

    def supported_multiprocessing(self) -> bool:
        # run() receives the full list of paths in a single call
        return False
241
+
242
+
243
@dataclass
class CopyNode(PipelineNode):
    """
    Encapsulated logic to copy the final result of the pipeline to the designated output location.
    """

    def initialize(self):
        """Log the start of the copy step and run the base initialization."""
        logger.info("Running copy node to move content to desired output location")
        super().initialize()

    @abstractmethod
    def run(self, json_path: str):
        """Copy one json file to its output location; implemented by subclasses."""
256
+
257
+
258
@dataclass
class PermissionsNode(PipelineNode):
    """
    Encapsulated logic to do operations on permissions related data.
    """

    def initialize(self):
        """Log the start of the permissions step and run the base initialization."""
        logger.info("Running permissions node to cleanup the permissions folder")
        super().initialize()

    @abstractmethod
    def run(self):
        """Perform the permissions operation; implemented by subclasses."""
@@ -0,0 +1,60 @@
1
+ import hashlib
2
+ import json
3
+ import typing as t
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
9
+ from unstructured_ingest.error import PartitionError
10
+ from unstructured_ingest.logger import logger
11
+ from unstructured_ingest.pipeline.interfaces import PartitionNode
12
+ from unstructured_ingest.pipeline.utils import get_ingest_doc_hash
13
+
14
+
15
@dataclass
class Partitioner(PartitionNode):
    """Partition node that processes one ingest doc and stores its elements as json."""

    @PartitionError.wrap
    def run(self, ingest_doc_dict) -> Optional[str]:
        """
        Partition the ingest doc described by ``ingest_doc_dict`` and return the path of the
        json file holding its elements.

        The output file name is derived from the partition settings hash and the ingest doc
        hash, and the doc is registered in the pipeline context under that name. An existing
        non-empty output file is reused unless reprocess is set. On failure the error is
        re-raised when raise_on_error is set; otherwise it is logged and None is returned.
        """
        try:
            ingest_doc = create_ingest_doc_from_dict(ingest_doc_dict)
            doc_filename_hash = get_ingest_doc_hash(ingest_doc_dict)
            digest_source = f"{self.create_hash()}{doc_filename_hash}"
            hashed_filename = hashlib.sha256(digest_source.encode()).hexdigest()[:32]
            self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_dict
            json_path = (Path(self.get_path()) / f"{hashed_filename}.json").resolve()

            if not self.pipeline_context.reprocess:
                if json_path.is_file() and json_path.stat().st_size:
                    logger.info(f"file exists: {json_path}, skipping partition")
                    return str(json_path)

            config = self.partition_config
            partition_kwargs: t.Dict[str, t.Any] = {
                "strategy": config.strategy,
                "encoding": config.encoding,
                "pdf_infer_table_structure": config.pdf_infer_table_structure,
                "languages": config.ocr_languages,
                "hi_res_model_name": config.hi_res_model_name,
            }
            if config.skip_infer_table_types:
                partition_kwargs["skip_infer_table_types"] = config.skip_infer_table_types
            # Caller-supplied extra arguments override the defaults assembled above
            if config.additional_partition_args:
                partition_kwargs.update(config.additional_partition_args)

            elements = ingest_doc.process_file(partition_config=config, **partition_kwargs)
            with open(json_path, "w", encoding="utf8") as output_f:
                logger.info(f"writing partitioned content to {json_path}")
                json.dump(elements, output_f, ensure_ascii=False, indent=2, sort_keys=True)
            return str(json_path)
        except Exception as e:
            if self.pipeline_context.raise_on_error:
                raise
            logger.error(f"failed to partition doc: {ingest_doc_dict}, {e}", exc_info=True)
            return None
@@ -0,0 +1,12 @@
1
+ from dataclasses import dataclass
2
+
3
+ from unstructured_ingest.interfaces import PermissionsCleanupMixin, ProcessorConfig
4
+ from unstructured_ingest.pipeline.interfaces import PermissionsNode
5
+
6
+
7
@dataclass
class PermissionsDataCleaner(PermissionsNode, PermissionsCleanupMixin):
    """Permissions node whose run() delegates to the mixin's cleanup_permissions()."""

    processor_config: ProcessorConfig

    def run(self):
        """Run the permissions cleanup."""
        self.cleanup_permissions()
@@ -0,0 +1,117 @@
1
+ import logging
2
+ import multiprocessing as mp
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Optional
5
+
6
+ from dataclasses_json import DataClassJsonMixin
7
+
8
+ from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
9
+ from unstructured_ingest.interfaces import BaseIngestDocBatch, BaseSingleIngestDoc
10
+ from unstructured_ingest.logger import ingest_log_streaming_init, logger
11
+ from unstructured_ingest.pipeline.copy import Copier
12
+ from unstructured_ingest.pipeline.interfaces import (
13
+ DocFactoryNode,
14
+ PartitionNode,
15
+ PipelineContext,
16
+ ReformatNode,
17
+ SourceNode,
18
+ WriteNode,
19
+ )
20
+ from unstructured_ingest.pipeline.permissions import PermissionsDataCleaner
21
+ from unstructured_ingest.pipeline.utils import get_ingest_doc_hash
22
+
23
+
24
@dataclass
class Pipeline(DataClassJsonMixin):
    """
    Runs the ingest nodes in order: doc factory -> source -> partition -> reformat nodes
    -> copy -> optional write -> optional permissions cleanup.
    """

    pipeline_context: PipelineContext
    doc_factory_node: DocFactoryNode
    source_node: SourceNode
    partition_node: Optional[PartitionNode] = None
    write_node: Optional[WriteNode] = None
    reformat_nodes: "list[ReformatNode]" = field(default_factory=list)
    permissions_node: Optional[PermissionsDataCleaner] = None

    def initialize(self):
        """Set up ingest log streaming at DEBUG when verbose, INFO otherwise."""
        ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)

    def get_nodes_str(self):
        """Return the class names of the configured nodes joined by " -> "."""
        nodes = [self.doc_factory_node, self.source_node]
        # partition_node is optional; leaving it out avoids a "NoneType" entry
        if self.partition_node is not None:
            nodes.append(self.partition_node)
        nodes.extend(self.reformat_nodes)
        if self.write_node:
            nodes.append(self.write_node)
        names = [node.__class__.__name__ for node in nodes]
        # The copy step always runs last; only its name is needed here
        names.append(Copier.__name__)
        return " -> ".join(names)

    def expand_batch_docs(self, dict_docs: "list[dict[str, Any]]") -> "list[dict[str, Any]]":
        """
        Flatten batch ingest docs into dicts of their single ingest docs.

        Raises:
            ValueError: when a doc is neither a BaseSingleIngestDoc nor a BaseIngestDocBatch.
        """
        expanded_docs: list[dict[str, Any]] = []
        for d in dict_docs:
            doc = create_ingest_doc_from_dict(d)
            if isinstance(doc, BaseSingleIngestDoc):
                expanded_docs.append(doc.to_dict())
            elif isinstance(doc, BaseIngestDocBatch):
                expanded_docs.extend(single_doc.to_dict() for single_doc in doc.ingest_docs)
            else:
                raise ValueError(
                    f"type of doc ({type(doc)}) is not a recognized type: "
                    f"BaseSingleIngestDoc or BaseIngestDocBatch"
                )
        return expanded_docs

    def run(self):
        """
        Execute the pipeline end to end, returning early when a stage yields nothing to
        process or when the source node is configured as download_only.

        Raises:
            ValueError: when files were fetched but no partition node is set.
        """
        # Install the redacting log handler before anything is logged
        self.initialize()
        logger.info(
            f"running pipeline: {self.get_nodes_str()} "
            f"with config: {self.pipeline_context.to_json()}",
        )
        manager = mp.Manager()
        self.pipeline_context.ingest_docs_map = manager.dict()
        # -- Get the documents to be processed --
        dict_docs = self.doc_factory_node()
        dict_docs = [manager.dict(d) for d in dict_docs]
        if not dict_docs:
            logger.info("no docs found to process")
            return
        logger.info(
            f"processing {len(dict_docs)} docs via "
            f"{self.pipeline_context.num_processes} processes",
        )
        for doc in dict_docs:
            self.pipeline_context.ingest_docs_map[get_ingest_doc_hash(doc)] = doc
        fetched_filenames = self.source_node(iterable=dict_docs)
        if self.source_node.read_config.download_only:
            logger.info("stopping pipeline after downloading files")
            return
        if not fetched_filenames:
            logger.info("No files to run partition over")
            return
        # -- To support batches ingest docs, expand those into the populated single ingest
        # -- docs after downloading content
        dict_docs = self.expand_batch_docs(dict_docs=dict_docs)
        if self.partition_node is None:
            raise ValueError("partition node not set")
        partitioned_jsons = self.partition_node(iterable=dict_docs)
        if not partitioned_jsons:
            logger.info("No files to process after partitioning")
            return
        for reformat_node in self.reformat_nodes:
            reformatted_jsons = reformat_node(iterable=partitioned_jsons)
            if not reformatted_jsons:
                logger.info(f"no files to process after {reformat_node.__class__.__name__}")
                return
            partitioned_jsons = reformatted_jsons

        # -- Copy the final destination to the desired location --
        copier = Copier(
            pipeline_context=self.pipeline_context,
        )
        copier(iterable=partitioned_jsons)

        if self.write_node:
            logger.info(
                f"uploading elements from {len(partitioned_jsons)} "
                "document(s) to the destination"
            )
            self.write_node(iterable=partitioned_jsons)

        if self.permissions_node:
            self.permissions_node.cleanup_permissions()
File without changes