unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import importlib
5
+ from functools import wraps
6
+ from typing import (
7
+ Callable,
8
+ List,
9
+ Optional,
10
+ TypeVar,
11
+ )
12
+
13
+ from typing_extensions import ParamSpec
14
+
15
+ _T = TypeVar("_T")
16
+ _P = ParamSpec("_P")
17
+
18
+
19
def requires_dependencies(
    dependencies: str | list[str],
    extras: Optional[str] = None,
) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
    """Build a decorator that verifies modules are importable before each call.

    Intended for functions that import optional packages locally: every call of
    the decorated function first checks the listed modules and raises with an
    installation hint when any of them cannot be imported.

    Args:
        dependencies: A module name, or a list of module names, that the
            decorated function needs.
        extras: Name of the unstructured-ingest extra that provides the
            modules. When given, the hint points at that extra; otherwise it
            lists the missing modules themselves. Defaults to None.

    Raises:
        ImportError: Raised at call time when at least one module is missing.
    """
    required = [dependencies] if isinstance(dependencies, str) else dependencies

    def decorator(func: Callable[_P, _T]) -> Callable[_P, _T]:
        def run_check():
            absent: List[str] = [name for name in required if not dependency_exists(name)]
            if not absent:
                return
            if extras:
                hint = f"""Please install them using `pip install "unstructured-ingest[{extras}]"`."""  # noqa: E501
            else:
                hint = f"Please install them using `pip install {' '.join(absent)}`."
            raise ImportError(f"Following dependencies are missing: {', '.join(absent)}. " + hint)

        # Coroutine functions need an async wrapper so the result is still awaitable.
        if asyncio.iscoroutinefunction(func):

            @wraps(func)
            async def guarded(*args: _P.args, **kwargs: _P.kwargs):
                run_check()
                return await func(*args, **kwargs)

        else:

            @wraps(func)
            def guarded(*args: _P.args, **kwargs: _P.kwargs):
                run_check()
                return func(*args, **kwargs)

        return guarded

    return decorator
69
+
70
+
71
def dependency_exists(dependency: str):
    """Report whether `dependency` can be imported.

    Returns False only when the import fails with an ImportError whose repr
    mentions `dependency`. An ImportError that does not mention it is treated
    as unrelated and the function still returns True.
    """
    try:
        importlib.import_module(dependency)
    except ImportError as err:
        # Only a failure that names the dependency itself counts as "missing".
        return dependency not in repr(err)
    return True
@@ -0,0 +1,9 @@
1
# Maps each Google-apps MIME type to the MIME type used in its place:
# the OOXML formats for documents, spreadsheets and presentations, JPEG for photos.
GOOGLE_DRIVE_EXPORT_TYPES = {
    "application/vnd.google-apps.document": (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ),
    "application/vnd.google-apps.spreadsheet": (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ),
    "application/vnd.google-apps.presentation": (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    ),
    "application/vnd.google-apps.photo": "image/jpeg",
}
@@ -0,0 +1,49 @@
1
+ import json
2
+ import typing as t
3
+ from datetime import datetime
4
+
5
+ from dateutil import parser
6
+
7
+
8
def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
    """Try to deserialize a JSON string, falling back to the input itself.

    Two attempts are made: the string as given, then the string with single
    quotes swapped for double quotes. If neither parses, the original string is
    returned unchanged, since it may simply be a path or malformed JSON.
    """
    attempts = (
        lambda raw: raw,
        # Single quotes used in place of double quotes is a common mistake.
        lambda raw: raw.replace("'", '"'),
    )
    for prepare in attempts:
        try:
            return json.loads(prepare(json_string))
        except json.JSONDecodeError:
            # Not necessarily an error: the value may be a path.
            continue
    return json_string
22
+
23
+
24
def ensure_isoformat_datetime(timestamp: t.Union[datetime, str]) -> str:
    """Return `timestamp` as an ISO-format datetime string.

    A datetime is formatted directly; a string is parsed with dateutil first.

    Raises:
        ValueError: The string could not be parsed as a datetime.
        TypeError: The input is neither a datetime nor a string.
    """
    if isinstance(timestamp, datetime):
        return timestamp.isoformat()
    if not isinstance(timestamp, str):
        raise TypeError(f"Expected input type datetime or str, but got {type(timestamp)}.")
    try:
        # dateutil accepts the datetime string in a variety of formats.
        return parser.parse(timestamp).isoformat()
    except ValueError as e:
        raise ValueError(f"String '{timestamp}' could not be parsed as a datetime.") from e
40
+
41
+
42
def truncate_string_bytes(string: str, max_bytes: int, encoding: str = "utf-8") -> str:
    """Truncate a string so that its encoded form fits in `max_bytes` bytes.

    Args:
        string: Value to truncate; it is converted with `str()` before encoding.
        max_bytes: Maximum size of the encoded result. Values below zero are
            treated as zero.
        encoding: Codec used to measure and cut the text.

    Returns:
        The text unchanged when it already fits, otherwise the longest prefix
        whose encoding fits. A multi-byte character cut by the limit is dropped
        rather than emitted partially.
    """
    text = str(string)
    encoded_string = text.encode(encoding)
    if len(encoded_string) <= max_bytes:
        # Return the converted text so the result is always a str.
        return text
    # Clamp at zero: a negative bound would otherwise slice from the end.
    limit = max(max_bytes, 0)
    return encoded_string[:limit].decode(encoding, errors="ignore")
@@ -0,0 +1,73 @@
1
+ from typing import Any
2
+
3
+ import pandas as pd
4
+
5
+ from unstructured_ingest.utils.data_prep import flatten_dict
6
+
7
+
8
def get_default_pandas_dtypes() -> dict[str, Any]:
    """Return the default column-name to pandas-dtype mapping.

    String columns each get their own `pd.StringDtype()` instance; the other
    columns use the nullable "Int64" / "boolean" dtype names, `float`, or
    `object`.
    """
    string = pd.StringDtype  # instantiated per column below
    schema = [
        ("text", string),
        ("type", string),
        ("element_id", string),
        ("filename", string),  # Optional[str]
        ("filetype", string),  # Optional[str]
        ("file_directory", string),  # Optional[str]
        ("last_modified", string),  # Optional[str]
        ("attached_to_filename", string),  # Optional[str]
        ("parent_id", string),  # Optional[str]
        ("category_depth", "Int64"),  # Optional[int]
        ("image_path", string),  # Optional[str]
        ("languages", object),  # Optional[list[str]]
        ("page_number", "Int64"),  # Optional[int]
        ("page_name", string),  # Optional[str]
        ("url", string),  # Optional[str]
        ("link_urls", string),  # Optional[str]
        ("link_texts", object),  # Optional[list[str]]
        ("links", object),
        ("sent_from", object),  # Optional[list[str]]
        ("sent_to", object),  # Optional[list[str]]
        ("subject", string),  # Optional[str]
        ("section", string),  # Optional[str]
        ("header_footer_type", string),  # Optional[str]
        ("emphasized_text_contents", object),  # Optional[list[str]]
        ("emphasized_text_tags", object),  # Optional[list[str]]
        ("text_as_html", string),  # Optional[str]
        ("regex_metadata", object),
        ("max_characters", "Int64"),  # Optional[int]
        ("is_continuation", "boolean"),  # Optional[bool]
        ("detection_class_prob", float),  # Optional[float]
        ("sender", string),
        ("coordinates_points", object),
        ("coordinates_system", string),
        ("coordinates_layout_width", float),
        ("coordinates_layout_height", float),
        ("data_source_url", string),  # Optional[str]
        ("data_source_version", string),  # Optional[str]
        ("data_source_record_locator", object),
        ("data_source_date_created", string),  # Optional[str]
        ("data_source_date_modified", string),  # Optional[str]
        ("data_source_date_processed", string),  # Optional[str]
        ("data_source_permissions_data", object),
        ("embeddings", object),
        ("regex_metadata_key", object),
    ]
    return {
        column: (spec() if spec is string else spec)  # type: ignore
        for column, spec in schema
    }
55
+
56
+
57
def convert_to_pandas_dataframe(
    elements_dict: list[dict[str, Any]],
    drop_empty_cols: bool = False,
) -> pd.DataFrame:
    """Build a DataFrame from element dicts, flattening nested metadata.

    Args:
        elements_dict: One dict per element. A truthy "metadata" entry is
            flattened into top-level keys; the "metadata" key itself is always
            dropped from the row. The input dicts are left untouched.
        drop_empty_cols: When True, drop columns whose values are all missing.

    Returns:
        A DataFrame whose known columns are cast to the dtypes from
        `get_default_pandas_dtypes()`.
    """
    rows = []
    for element in elements_dict:
        # Work on a shallow copy so the caller's dicts are not modified.
        row = dict(element)
        # Flatten metadata if it hasn't already been flattened
        if metadata := row.pop("metadata", None):
            row.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))
        rows.append(row)

    df = pd.DataFrame.from_dict(rows)
    # Only cast the columns that are actually present in this frame.
    dt = {k: v for k, v in get_default_pandas_dtypes().items() if k in df.columns}
    df = df.astype(dt)
    if drop_empty_cols:
        df.dropna(axis=1, how="all", inplace=True)
    return df
@@ -0,0 +1 @@
1
+ from __future__ import annotations
File without changes
@@ -0,0 +1,4 @@
1
+ from .dest import DestCmd
2
+ from .src import SrcCmd
3
+
4
+ __all__ = ["SrcCmd", "DestCmd"]
@@ -0,0 +1,269 @@
1
+ import inspect
2
+ from abc import ABC, abstractmethod
3
+ from collections import Counter
4
+ from dataclasses import dataclass, field, fields
5
+ from typing import Any, Optional, Type, TypeVar
6
+
7
+ import click
8
+ from pydantic import BaseModel
9
+
10
+ from unstructured_ingest.v2.cli.base.importer import import_from_string
11
+ from unstructured_ingest.v2.cli.utils.click import extract_config
12
+ from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model, post_check
13
+ from unstructured_ingest.v2.interfaces import ProcessorConfig
14
+ from unstructured_ingest.v2.logger import logger
15
+ from unstructured_ingest.v2.pipeline.pipeline import Pipeline
16
+ from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
17
+ from unstructured_ingest.v2.processes.connector_registry import (
18
+ DownloaderT,
19
+ IndexerT,
20
+ RegistryEntry,
21
+ UploaderT,
22
+ UploadStager,
23
+ UploadStagerConfig,
24
+ UploadStagerT,
25
+ destination_registry,
26
+ source_registry,
27
+ )
28
+ from unstructured_ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig
29
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
30
+ from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig
31
+ from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
32
+
33
+ CommandT = TypeVar("CommandT", bound=click.Command)
34
+
35
+
36
@dataclass
class BaseCmd(ABC):
    """Base for dynamically generated CLI commands.

    Converts the config models of a registry entry into click options and
    builds a `Pipeline` from the parsed option values.
    """

    # Command name; "-" and "_" variants are derived by the properties below.
    cmd_name: str
    # Connector registry entry this command exposes.
    registry_entry: RegistryEntry
    # Extra config models whose fields are added as options to the command.
    default_configs: list[Type[BaseModel]] = field(default_factory=list)

    @abstractmethod
    def get_registry_options(self):
        """Return the click options derived from the registry entry."""
        pass

    def get_default_options(self) -> list[click.Option]:
        """Return the click options generated from `default_configs`."""
        options = []
        for extra in self.default_configs:
            options.extend(options_from_base_model(model=extra))
        return options

    @classmethod
    def consolidate_options(cls, options: list[click.Option]) -> list[click.Option]:
        """Collapse identically defined duplicate options into one.

        Raises:
            ValueError: Two options share a name but differ in definition.
        """
        option_names = [option.name for option in options]
        duplicate_names = [name for name, count in Counter(option_names).items() if count > 1]
        if not duplicate_names:
            return options
        consolidated_options = []
        current_names = []
        for option in options:
            if option.name not in current_names:
                current_names.append(option.name)
                consolidated_options.append(option)
                continue
            existing_option = next(o for o in consolidated_options if o.name == option.name)
            if existing_option.__dict__ == option.__dict__:
                # Exact duplicate of an option that was already kept.
                continue
            option_diff = cls.get_options_diff(o1=option, o2=existing_option)
            raise ValueError(
                "Conflicting duplicate {} option defined: {}".format(
                    option.name, " | ".join([f"{d[0]}: {d[1]}" for d in option_diff])
                )
            )
        return consolidated_options

    @staticmethod
    def get_options_diff(o1: click.Option, o2: click.Option):
        """Return the set of (attribute, value) pairs that differ between two options."""
        # Work on shallow copies: writing into `__dict__` directly would replace
        # the options' real `opts` / `secondary_opts` lists with joined strings.
        o1_dict = dict(o1.__dict__)
        o2_dict = dict(o2.__dict__)
        for d in [o1_dict, o2_dict]:
            # Lists are unhashable, so join them before building the sets.
            d["opts"] = ",".join(d["opts"])
            d["secondary_opts"] = ",".join(d["secondary_opts"])
        option_diff = set(o1_dict.items()) ^ set(o2_dict.items())
        return option_diff

    @property
    def cmd_name_key(self):
        """Command name with dashes replaced by underscores."""
        return self.cmd_name.replace("-", "_")

    @property
    def cli_cmd_name(self):
        """Command name with underscores replaced by dashes."""
        return self.cmd_name.replace("_", "-")

    @abstractmethod
    def cmd(self, ctx: click.Context, **options) -> None:
        """Run the command with the parsed click options."""
        pass

    def add_options(self, cmd: CommandT) -> CommandT:
        """Attach the registry options and the default options to `cmd`."""
        options = self.get_registry_options()
        options.extend(self.get_default_options())
        post_check(options)
        cmd.params.extend(options)
        return cmd

    def get_pipeline(
        self,
        src: str,
        source_options: dict[str, Any],
        dest: Optional[str] = None,
        destination_options: Optional[dict[str, Any]] = None,
    ) -> Pipeline:
        """Assemble a `Pipeline` from flat source and destination option dicts.

        Chunker, filterer, embedder and stager are added only when their
        getters return one. Without `dest`, a local uploader is used, with
        "output_dir" falling back to the value in `source_options`.
        """
        logger.debug(
            f"creating pipeline from cli using source {src} with options: {source_options}"
        )
        pipeline_kwargs: dict[str, Any] = {
            "context": self.get_processor_config(options=source_options),
            "downloader": self.get_downloader(src=src, options=source_options),
            "indexer": self.get_indexer(src=src, options=source_options),
            "partitioner": self.get_partitioner(options=source_options),
        }
        if chunker := self.get_chunker(options=source_options):
            pipeline_kwargs["chunker"] = chunker
        if filterer := self.get_filterer(options=source_options):
            pipeline_kwargs["filterer"] = filterer
        if embedder := self.get_embedder(options=source_options):
            pipeline_kwargs["embedder"] = embedder
        if dest:
            logger.debug(
                f"setting destination on pipeline {dest} with options: {destination_options}"
            )
            # The getters below read from the dict, so None must become {}.
            destination_options = destination_options or {}
            if uploader_stager := self.get_upload_stager(dest=dest, options=destination_options):
                pipeline_kwargs["stager"] = uploader_stager
            pipeline_kwargs["uploader"] = self.get_uploader(dest=dest, options=destination_options)
        else:
            # Default to local uploader
            # TODO remove after v1 no longer supported
            destination_options = destination_options or {}
            if "output_dir" not in destination_options:
                destination_options["output_dir"] = source_options["output_dir"]
            pipeline_kwargs["uploader"] = self.get_default_uploader(options=destination_options)
        return Pipeline(**pipeline_kwargs)

    @staticmethod
    def get_default_uploader(options: dict[str, Any]) -> UploaderT:
        """Build the local uploader from the flat options."""
        uploader_config = extract_config(flat_data=options, config=LocalUploaderConfig)
        return LocalUploader(upload_config=uploader_config)

    @staticmethod
    def get_chunker(options: dict[str, Any]) -> Optional[Chunker]:
        """Build a chunker, or return None when no chunking strategy is set."""
        chunker_config = extract_config(flat_data=options, config=ChunkerConfig)
        if not chunker_config.chunking_strategy:
            return None
        return Chunker(config=chunker_config)

    @staticmethod
    def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
        """Build a filterer, or return None when the config dumps to an empty dict."""
        filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
        if not filterer_configs.model_dump():
            return None
        return Filterer(config=filterer_configs)

    @staticmethod
    def get_embedder(options: dict[str, Any]) -> Optional[Embedder]:
        """Build an embedder, or return None when no embedding provider is set."""
        embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
        if not embedder_config.embedding_provider:
            return None
        return Embedder(config=embedder_config)

    @staticmethod
    def get_partitioner(options: dict[str, Any]) -> Partitioner:
        """Build the partitioner from the flat options."""
        partitioner_config = extract_config(flat_data=options, config=PartitionerConfig)
        return Partitioner(config=partitioner_config)

    @staticmethod
    def get_processor_config(options: dict[str, Any]) -> ProcessorConfig:
        """Extract the `ProcessorConfig` from the flat options."""
        return extract_config(flat_data=options, config=ProcessorConfig)

    @staticmethod
    def get_indexer(src: str, options: dict[str, Any]) -> IndexerT:
        """Instantiate the indexer registered for `src`.

        Index and connection configs are passed only when the registry entry
        declares the corresponding config class.
        """
        source_entry = source_registry[src]
        indexer_kwargs: dict[str, Any] = {}
        if indexer_config_cls := source_entry.indexer_config:
            indexer_kwargs["index_config"] = extract_config(
                flat_data=options, config=indexer_config_cls
            )
        if connection_config_cls := source_entry.connection_config:
            indexer_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        indexer_cls = source_entry.indexer
        return indexer_cls(**indexer_kwargs)

    @staticmethod
    def get_downloader(src: str, options: dict[str, Any]) -> DownloaderT:
        """Instantiate the downloader registered for `src`.

        Download and connection configs are passed only when the registry entry
        declares the corresponding config class.
        """
        source_entry = source_registry[src]
        downloader_kwargs: dict[str, Any] = {}
        if downloader_config_cls := source_entry.downloader_config:
            downloader_kwargs["download_config"] = extract_config(
                flat_data=options, config=downloader_config_cls
            )
        if connection_config_cls := source_entry.connection_config:
            downloader_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        downloader_cls = source_entry.downloader
        return downloader_cls(**downloader_kwargs)

    @staticmethod
    def get_custom_stager(
        stager_reference: str, stager_config_kwargs: Optional[dict] = None
    ) -> Optional[UploadStagerT]:
        """Import and instantiate a user-supplied upload stager.

        Args:
            stager_reference: Import string passed to `import_from_string`.
            stager_config_kwargs: Keyword arguments for the stager's config
                class. When empty, the stager is built without a config.

        Raises:
            ValueError: The reference is not an `UploadStager` subclass, or the
                type of its `upload_stager_config` field is not an
                `UploadStagerConfig` subclass.
        """
        stager_cls = import_from_string(stager_reference)
        if not inspect.isclass(stager_cls):
            raise ValueError(
                f"custom stager must be a reference to a python class, got: {type(stager_cls)}"
            )
        if not issubclass(stager_cls, UploadStager):
            raise ValueError(
                "custom stager must be an implementation of the UploadStager interface"
            )
        # The config class is read from the declared type of the dataclass field.
        fields_dict = {f.name: f.type for f in fields(stager_cls)}
        upload_stager_config_cls = fields_dict["upload_stager_config"]
        if not inspect.isclass(upload_stager_config_cls):
            raise ValueError(
                f"custom stager config must be a class, got: {type(upload_stager_config_cls)}"
            )
        if not issubclass(upload_stager_config_cls, UploadStagerConfig):
            raise ValueError(
                "custom stager config must be an implementation "
                "of the UploadStagerConfig interface"
            )
        upload_stager_kwargs: dict[str, Any] = {}
        if stager_config_kwargs:
            upload_stager_kwargs["upload_stager_config"] = upload_stager_config_cls(
                **stager_config_kwargs
            )
        return stager_cls(**upload_stager_kwargs)

    @staticmethod
    def get_upload_stager(dest: str, options: dict[str, Any]) -> Optional[UploadStagerT]:
        """Return the stager for `dest`, preferring a custom stager when one is set.

        Returns None when no custom stager is requested and the destination
        registry entry has no upload stager.
        """
        if custom_stager := options.get("custom_stager"):
            return BaseCmd.get_custom_stager(
                stager_reference=custom_stager,
                stager_config_kwargs=options.get("custom_stager_config_kwargs"),
            )
        dest_entry = destination_registry[dest]
        upload_stager_kwargs: dict[str, Any] = {}
        if upload_stager_config_cls := dest_entry.upload_stager_config:
            upload_stager_kwargs["upload_stager_config"] = extract_config(
                flat_data=options, config=upload_stager_config_cls
            )
        if upload_stager_cls := dest_entry.upload_stager:
            return upload_stager_cls(**upload_stager_kwargs)
        return None

    @staticmethod
    def get_uploader(dest, options: dict[str, Any]) -> UploaderT:
        """Instantiate the uploader registered for `dest`.

        Upload and connection configs are passed only when the registry entry
        declares the corresponding config class.
        """
        dest_entry = destination_registry[dest]
        uploader_kwargs: dict[str, Any] = {}
        if uploader_config_cls := dest_entry.uploader_config:
            uploader_kwargs["upload_config"] = extract_config(
                flat_data=options, config=uploader_config_cls
            )
        if connection_config_cls := dest_entry.connection_config:
            uploader_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        uploader_cls = dest_entry.uploader
        return uploader_cls(**uploader_kwargs)
@@ -0,0 +1,85 @@
1
+ import logging
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.v2.cli.base.cmd import BaseCmd
7
+ from unstructured_ingest.v2.cli.utils.click import Dict, conform_click_options
8
+ from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model
9
+ from unstructured_ingest.v2.logger import logger
10
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
11
+
12
+
13
@dataclass
class DestCmd(BaseCmd):
    """CLI command for a destination connector, run as a child of a source command."""

    # Narrows the base-class field to destination registry entries.
    registry_entry: DestinationRegistryEntry

    def get_registry_options(self):
        """Return click options for the uploader, stager and connection configs."""
        options = []
        configs = [
            config
            for config in [
                self.registry_entry.uploader_config,
                self.registry_entry.upload_stager_config,
                self.registry_entry.connection_config,
            ]
            if config
        ]
        for config in configs:
            options.extend(options_from_base_model(model=config))
        # The configs may declare the same option; keep a single copy of each.
        options = self.consolidate_options(options=options)
        return options

    def cmd(self, ctx: click.Context, **options) -> None:
        """Build and run the pipeline for this destination.

        The source command name and source options are read from the parent
        click context.

        Raises:
            click.ClickException: The command has no parent, the parent has no
                info name, or building/running the pipeline failed.
        """
        logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
        if not ctx.parent:
            raise click.ClickException("destination command called without a parent")
        if not ctx.parent.info_name:
            raise click.ClickException("parent command missing info name")
        source_cmd = ctx.parent.info_name.replace("-", "_")
        # ctx.parent is known to be set here because of the guard above.
        source_options: dict = ctx.parent.params
        conform_click_options(options)
        try:
            pipeline = self.get_pipeline(
                src=source_cmd,
                source_options=source_options,
                dest=self.cmd_name,
                destination_options=options,
            )
            pipeline.run()
        except Exception as e:
            logger.error(f"failed to run destination command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Command:
        """Create the click command for this destination, including custom-stager options."""
        # Dynamically create the command without the use of click decorators
        fn = self.cmd
        fn = click.pass_context(fn)
        cmd = click.command(fn)
        if not isinstance(cmd, click.core.Command):
            raise ValueError(f"generated command was not of expected type Command: {type(cmd)}")
        cmd.name = self.cli_cmd_name
        cmd.short_help = "v2"
        cmd.invoke_without_command = True
        self.add_options(cmd)
        cmd.params.append(
            click.Option(
                ["--custom-stager"],
                required=False,
                type=str,
                default=None,
                help="Pass a pointer to a custom upload stager to use, "
                "must be in format '<module>:<attribute>'",
            )
        )
        cmd.params.append(
            click.Option(
                ["--custom-stager-config-kwargs"],
                required=False,
                type=Dict(),
                default=None,
                help="Any kwargs to instantiate the configuration "
                "associated with the custom stager",
            )
        )
        return cmd
@@ -0,0 +1,34 @@
1
+ import importlib
2
+ from typing import Any
3
+
4
+
5
class ImportFromStringError(Exception):
    """Raised when an import string is malformed or cannot be resolved."""
7
+
8
+
9
def import_from_string(import_str: Any) -> Any:
    """Resolve a "<module>:<attribute>" string to the object it names.

    Non-string input is returned as-is. The attribute part may be a dotted
    path, which is followed one attribute at a time from the imported module.

    Raises:
        ImportFromStringError: The string is not in "<module>:<attribute>"
            form, the module itself cannot be found, or an attribute on the
            path does not exist.
        ModuleNotFoundError: Importing the module failed because some other
            module is missing.
    """
    if not isinstance(import_str, str):
        return import_str

    module_str, _, attrs_str = import_str.partition(":")
    if not (module_str and attrs_str):
        raise ImportFromStringError(
            f'Import string "{import_str}" must be in format "<module>:<attribute>".'
        )

    try:
        module = importlib.import_module(module_str)
    except ModuleNotFoundError as exc:
        # A missing module other than the requested one is surfaced unchanged.
        if exc.name != module_str:
            raise exc from None
        raise ImportFromStringError(f'Could not import module "{module_str}".')

    target = module
    try:
        for attr_name in attrs_str.split("."):
            target = getattr(target, attr_name)
    except AttributeError:
        raise ImportFromStringError(
            f'Attribute "{attrs_str}" not found in module "{module_str}".'
        )

    return target