unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,24 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.fsspec.azure import (
9
+ AzureWriteConfig,
10
+ SimpleAzureBlobStorageConfig,
11
+ )
12
+
13
+
14
@dataclass
class AzureWriter(Writer):
    """Writer whose destination connector is the Azure Blob Storage connector."""

    connector_config: "SimpleAzureBlobStorageConfig"
    write_config: "AzureWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Azure Blob Storage destination connector class."""
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.fsspec.azure import (
            AzureBlobStorageDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,21 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.fsspec.box import BoxWriteConfig, SimpleBoxConfig
9
+
10
+
11
@dataclass
class BoxWriter(Writer):
    """Writer whose destination connector is the Box connector."""

    connector_config: "SimpleBoxConfig"
    write_config: "BoxWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Box destination connector class."""
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.fsspec.box import (
            BoxDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,21 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.fsspec.dropbox import DropboxWriteConfig, SimpleDropboxConfig
9
+
10
+
11
@dataclass
class DropboxWriter(Writer):
    """Writer whose destination connector is the Dropbox connector."""

    connector_config: "SimpleDropboxConfig"
    write_config: "DropboxWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Dropbox destination connector class."""
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.fsspec.dropbox import (
            DropboxDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,19 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.fsspec.gcs import GcsWriteConfig, SimpleGcsConfig
9
+
10
+
11
@dataclass
class GcsWriter(Writer):
    """Writer whose destination connector is the GCS connector."""

    connector_config: "SimpleGcsConfig"
    write_config: "GcsWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the GCS destination connector class."""
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.fsspec.gcs import (
            GcsDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,21 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.fsspec.s3 import S3WriteConfig, SimpleS3Config
9
+
10
+
11
@dataclass
class S3Writer(Writer):
    """Writer whose destination connector is the S3 connector."""

    connector_config: "SimpleS3Config"
    write_config: "S3WriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the S3 destination connector class."""
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.fsspec.s3 import (
            S3DestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,21 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.kafka import KafkaWriteConfig, SimpleKafkaConfig
9
+
10
+
11
@dataclass
class KafkaWriter(Writer):
    """Writer whose destination connector is the Kafka connector."""

    write_config: "KafkaWriteConfig"
    connector_config: "SimpleKafkaConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Kafka destination connector class."""
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.kafka import (
            KafkaDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,21 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.mongodb import MongoDBWriteConfig, SimpleMongoDBConfig
9
+
10
+
11
@dataclass
class MongodbWriter(Writer):
    """Writer whose destination connector is the MongoDB connector."""

    write_config: "MongoDBWriteConfig"
    connector_config: "SimpleMongoDBConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the MongoDB destination connector class."""
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.mongodb import (
            MongoDBDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,26 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.elasticsearch import (
9
+ ElasticsearchWriteConfig,
10
+ )
11
+ from unstructured_ingest.connector.opensearch import (
12
+ SimpleOpenSearchConfig,
13
+ )
14
+
15
+
16
@dataclass
class OpenSearchWriter(Writer):
    """Writer whose destination connector is the OpenSearch connector.

    The write configuration type is the Elasticsearch one
    (``ElasticsearchWriteConfig``); only the connector configuration is
    OpenSearch-specific.
    """

    connector_config: "SimpleOpenSearchConfig"
    write_config: "ElasticsearchWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the OpenSearch destination connector class.

        The method returns the class itself, not an instance, so the return
        annotation is ``t.Type[...]`` like the other writers.
        """
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.opensearch import (
            OpenSearchDestinationConnector,
        )

        return OpenSearchDestinationConnector
@@ -0,0 +1,21 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.pinecone import PineconeWriteConfig, SimplePineconeConfig
9
+
10
+
11
@dataclass
class PineconeWriter(Writer):
    """Writer whose destination connector is the Pinecone connector."""

    write_config: "PineconeWriteConfig"
    connector_config: "SimplePineconeConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Pinecone destination connector class."""
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.pinecone import (
            PineconeDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,19 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.qdrant import QdrantWriteConfig, SimpleQdrantConfig
9
+
10
+
11
@dataclass
class QdrantWriter(Writer):
    """Writer whose destination connector is the Qdrant connector."""

    write_config: "QdrantWriteConfig"
    connector_config: "SimpleQdrantConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Qdrant destination connector class."""
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.qdrant import (
            QdrantDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,22 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.sql import SimpleSqlConfig
9
+ from unstructured_ingest.interfaces import WriteConfig
10
+
11
+
12
@dataclass
class SqlWriter(Writer):
    """Writer whose destination connector is the SQL connector."""

    write_config: "WriteConfig"
    connector_config: "SimpleSqlConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the SQL destination connector class."""
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.sql import (
            SqlDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,22 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
5
+ from unstructured_ingest.interfaces import BaseDestinationConnector
6
+ from unstructured_ingest.runner.writers.base_writer import Writer
7
+
8
+ if t.TYPE_CHECKING:
9
+ from unstructured_ingest.connector.vectara import SimpleVectaraConfig, VectaraWriteConfig
10
+
11
+
12
@dataclass
class VectaraWriter(Writer, EnhancedDataClassJsonMixin):
    """Writer whose destination connector is the Vectara connector."""

    write_config: "VectaraWriteConfig"
    connector_config: "SimpleVectaraConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Vectara destination connector class."""
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.vectara import (
            VectaraDestinationConnector as connector_cls,
        )

        return connector_cls
@@ -0,0 +1,21 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.weaviate import SimpleWeaviateConfig, WeaviateWriteConfig
9
+
10
+
11
@dataclass
class WeaviateWriter(Writer):
    """Writer whose destination connector is the Weaviate connector."""

    write_config: "WeaviateWriteConfig"
    connector_config: "SimpleWeaviateConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Weaviate destination connector class."""
        # The connector module is imported only when this method is called.
        from unstructured_ingest.connector.weaviate import (
            WeaviateDestinationConnector as connector_cls,
        )

        return connector_cls
File without changes
@@ -0,0 +1,56 @@
1
+ import base64
2
+ import hashlib
3
+ import json
4
+ import zlib
5
+ from itertools import groupby
6
+
7
+
8
def id_to_hash(element: dict, sequence_number: int) -> str:
    """Compute a deterministic ID for an element and store it on the element.

    The ID is the first 32 hex characters of the SHA-256 digest of the
    concatenation of the element's filename, text, page number and the given
    sequence number. The element dict is mutated: its ``"element_id"`` key is
    overwritten with the new value.

    Args:
        element: element dict with a ``"text"`` key and a ``"metadata"`` dict.
        sequence_number: index on page

    Returns: new ID value
    """
    metadata = element["metadata"]
    source_name = metadata.get("filename")
    body = element["text"]
    page = metadata.get("page_number")
    digest = hashlib.sha256(f"{source_name}{body}{page}{sequence_number}".encode())
    new_id = digest.hexdigest()[:32]
    element["element_id"] = new_id
    return new_id
25
+
26
+
27
def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
    """Replace every element ID with a hash ID and re-point parent IDs.

    Only the list is copied (shallow copy); the element dicts themselves are
    mutated in place. Raises ``KeyError`` if an element's parent ID is not the
    original ID of any element in ``elements``.
    """
    elements = elements.copy()

    # -- sequence number restarts at 0 for every run of equal page numbers --
    sequence_numbers: list[int] = []
    for _, run in groupby(e["metadata"].get("page_number") for e in elements):
        sequence_numbers.extend(index for index, _ in enumerate(run))

    # -- assign hash IDs, remembering which old ID each one replaced --
    new_id_by_old_id = {}
    for element, sequence_number in zip(elements, sequence_numbers):
        old_id = element["element_id"]
        new_id_by_old_id[old_id] = id_to_hash(element=element, sequence_number=sequence_number)

    # -- rewrite parent references so they point at the new IDs --
    for element in elements:
        metadata = element["metadata"]
        parent_id = metadata.get("parent_id")
        if parent_id:
            element["metadata"]["parent_id"] = new_id_by_old_id[parent_id]

    return elements
49
+
50
+
51
def elements_from_base64_gzipped_json(raw_s: str) -> list[dict]:
    """Decode a base64 string holding zlib-compressed UTF-8 JSON.

    The input is base64-decoded, decompressed with ``zlib.decompress``,
    decoded as UTF-8 and parsed as JSON; the parsed value is returned.
    """
    compressed = base64.b64decode(raw_s)
    json_text = zlib.decompress(compressed).decode("utf-8")
    return json.loads(json_text)
@@ -0,0 +1,118 @@
1
+ import copy
2
+ import os
3
+ import sys
4
+ import tarfile
5
+ import zipfile
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import List, Optional
9
+
10
+ from unstructured_ingest.connector.local import LocalSourceConnector, SimpleLocalConfig
11
+ from unstructured_ingest.interfaces import (
12
+ BaseConnectorConfig,
13
+ BaseSingleIngestDoc,
14
+ ProcessorConfig,
15
+ ReadConfig,
16
+ )
17
+ from unstructured_ingest.logger import logger
18
+
19
+ ZIP_FILE_EXT = [".zip"]
20
+ TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]
21
+
22
+
23
def uncompress_file(filename: str, path: Optional[str] = None) -> str:
    """
    Decompress a zip or tar archive, chosen by the filename's extension.

    If ``path`` is given it is created (including parents) before extraction.
    Returns whatever the zip/tar helper returns (the extraction directory).
    Raises ``ValueError`` when the extension is in neither ZIP_FILE_EXT nor
    TAR_FILE_EXT.
    """
    # Create path if it doesn't already exist
    if path:
        Path(path).mkdir(parents=True, exist_ok=True)

    looks_like_zip = any(filename.endswith(suffix) for suffix in ZIP_FILE_EXT)
    if looks_like_zip:
        return uncompress_zip_file(zip_filename=filename, path=path)

    looks_like_tar = any(filename.endswith(suffix) for suffix in TAR_FILE_EXT)
    if looks_like_tar:
        return uncompress_tar_file(tar_filename=filename, path=path)

    supported = ", ".join(ZIP_FILE_EXT + TAR_FILE_EXT)
    raise ValueError(
        "filename {} not a recognized compressed extension: {}".format(filename, supported),
    )
42
+
43
+
44
def uncompress_zip_file(zip_filename: str, path: Optional[str] = None) -> str:
    """Extract a zip archive and return the directory it was extracted into.

    When ``path`` is not given, the archive is extracted next to itself into
    ``<name without zip extension>-zip-uncompressed``.
    """
    parent_dir, base_name = os.path.split(zip_filename)
    # Strip the first matching zip extension from the file name, if any.
    matched_ext = next((ext for ext in ZIP_FILE_EXT if base_name.endswith(ext)), None)
    if matched_ext is not None:
        base_name = base_name[: -(len(matched_ext))]

    if not path:
        path = os.path.join(parent_dir, f"{base_name}-zip-uncompressed")
    logger.info(f"extracting zip {zip_filename} -> {path}")
    with zipfile.ZipFile(zip_filename) as archive:
        archive.extractall(path=path)
    return path
55
+
56
+
57
def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
    """Extract a tar archive and return the directory it was extracted into.

    When ``path`` is not given, the archive is extracted next to itself into
    ``<name without tar extension>-tar-uncompressed``.

    The ``tarfile.tar_filter`` extraction filter is applied whenever the
    running interpreter provides it. Detection is done by feature (attribute
    presence) rather than by version number, because extraction filters were
    also backported to security releases of Python versions older than 3.12.
    """
    head, tail = os.path.split(tar_filename)
    for ext in TAR_FILE_EXT:
        if tail.endswith(ext):
            tail = tail[: -(len(ext))]
            break

    path = path if path else os.path.join(head, f"{tail}-tar-uncompressed")
    logger.info(f"extracting tar {tar_filename} -> {path}")
    # NOTE: "r:*" mode opens both compressed (e.g ".tar.gz") and uncompressed ".tar" archives
    with tarfile.open(tar_filename, "r:*") as tfile:
        # NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
        # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
        tar_filter = getattr(tarfile, "tar_filter", None)
        if tar_filter is not None:
            tfile.extraction_filter = tar_filter
        else:
            logger.warning(
                "Extraction filtering for tar files is available for Python 3.12 and above. "
                "Consider upgrading your Python version to improve security. "
                "See https://docs.python.org/3/library/tarfile.html#extraction-filters"
            )
        tfile.extractall(path=path)
    return path
81
+
82
+
83
@dataclass
class CompressionSourceConnectorMixin:
    """Mixin that turns a compressed ingest doc into ingest docs for its contents."""

    processor_config: ProcessorConfig
    read_config: ReadConfig
    connector_config: BaseConnectorConfig

    def process_compressed_doc(self, doc: BaseSingleIngestDoc) -> List[BaseSingleIngestDoc]:
        """
        Fetch a compressed doc, extract it, and return the ingest docs that a local
        source connector generates for the extracted directory.
        """
        # Download the raw file to local
        doc.get_file()
        extracted_dir = uncompress_file(filename=str(doc.filename))
        child_read_config = copy.copy(self.read_config)
        child_processor_config = copy.copy(self.processor_config)

        # Output goes under the configured output dir, at the extracted directory's
        # location with the download dir text removed from it.
        relative_path = extracted_dir.replace(self.read_config.download_dir, "")
        base_output_dir = self.processor_config.output_dir
        joiner = "" if base_output_dir.endswith(os.sep) else os.sep
        child_processor_config.output_dir = f"{base_output_dir}{joiner}{relative_path}"

        local_config = SimpleLocalConfig(
            input_path=extracted_dir,
            recursive=True,
        )
        local_connector = LocalSourceConnector(
            connector_config=local_config,
            read_config=child_read_config,
            processor_config=child_processor_config,
        )
        logger.info(f"created local source connector: {local_connector.to_json()}")
        local_connector.initialize()
        return local_connector.get_ingest_docs()
@@ -0,0 +1,200 @@
1
+ import itertools
2
+ import json
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
6
+
7
+ import ndjson
8
+ import pandas as pd
9
+
10
+ from unstructured_ingest.v2.logger import logger
11
+
12
+ DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
13
+
14
+ T = TypeVar("T")
15
+ IterableT = Iterable[T]
16
+
17
+
18
def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Generator[pd.DataFrame, None, None]:
    """Yield consecutive row slices of ``df`` holding at most ``chunk_size`` rows each.

    Only non-empty chunks are produced: a frame whose length is an exact
    multiple of ``chunk_size`` does not end with an empty chunk, and an empty
    frame yields nothing.

    Raises:
        ValueError: if ``chunk_size`` is not a positive integer (raised when
            the generator is first advanced).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    # Stepping by chunk_size visits exactly the start offsets that have rows.
    for start in range(0, len(df), chunk_size):
        yield df[start : start + chunk_size]
22
+
23
+
24
def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
    """Lazily yield tuples of up to ``batch_size`` consecutive items from ``iterable``.

    Iteration stops as soon as an empty tuple would be produced, so the last
    batch may be shorter than ``batch_size``.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps calling until the callable returns the empty tuple.
    yield from iter(lambda: tuple(itertools.islice(source, batch_size)), ())
31
+
32
+
33
def generator_batching_wbytes(
    iterable: Iterable[Any],
    batch_size_limit_bytes: Optional[int] = None,
    max_batch_size: Optional[int] = None,
) -> Generator[list[Any], None, None]:
    """Break an iterable into batches bounded by serialized size and/or item count.

    Args:
        iterable: items to batch; each must be JSON-serializable when a limit is set.
        batch_size_limit_bytes: maximum total size of a batch, measured as the sum
            of the UTF-8 byte lengths of ``json.dumps(item)``. A single item larger
            than the limit is emitted alone in its own batch.
        max_batch_size: maximum number of items per batch.

    Yields:
        Non-empty lists of items. When neither limit is set, all items are
        yielded as one batch (nothing is yielded for an empty iterable).
    """
    if not batch_size_limit_bytes and not max_batch_size:
        # This is a generator function, so a bare `return iterable` would drop
        # the data; hand everything back as a single batch instead.
        everything = list(iterable)
        if everything:
            yield everything
        return

    current_batch: list[Any] = []
    current_batch_size = 0

    for item in iterable:
        item_size_bytes = len(json.dumps(item).encode("utf-8"))
        exceeds_bytes = bool(batch_size_limit_bytes) and (
            current_batch_size + item_size_bytes > batch_size_limit_bytes
        )
        exceeds_count = bool(max_batch_size) and len(current_batch) + 1 > max_batch_size

        # Flush only when there is something to flush, so an oversized first
        # item never produces an empty batch.
        if current_batch and (exceeds_bytes or exceeds_count):
            yield current_batch
            current_batch, current_batch_size = [], 0

        current_batch.append(item)
        current_batch_size += item_size_bytes

    if current_batch:
        yield current_batch
59
+
60
+
61
def flatten_dict(
    dictionary: dict[str, Any],
    parent_key: str = "",
    separator: str = "_",
    flatten_lists: bool = False,
    remove_none: bool = False,
    keys_to_omit: Optional[Sequence[str]] = None,
) -> dict[str, Any]:
    """Collapse a nested dictionary into one level, joining key paths with `separator`.

    A key listed in keys_to_omit keeps its value untouched (not flattened); nested keys
    are named as {parent_key}{separator}{key}. With flatten_lists=True, lists and tuples
    are expanded too, using the element index as the key segment. With remove_none=True,
    entries whose value is None are dropped, unless the key is in keys_to_omit.
    """
    omitted = keys_to_omit if keys_to_omit else []
    result: dict[str, Any] = {}

    for key, value in dictionary.items():
        full_key = key if not parent_key else f"{parent_key}{separator}{key}"

        # Omitted keys win over every other rule, including remove_none.
        if full_key in omitted:
            result[full_key] = value
            continue

        if remove_none and value is None:
            continue

        if isinstance(value, dict):
            nested = flatten_dict(
                value, full_key, separator, flatten_lists, remove_none, keys_to_omit=omitted
            )
            result.update(nested)
            continue

        if flatten_lists and isinstance(value, (list, tuple)):
            # Each element is re-entered as a one-item dict keyed by its indexed path.
            for index, item in enumerate(value):
                indexed = {f"{full_key}{separator}{index}": item}
                result.update(
                    flatten_dict(
                        indexed, "", separator, flatten_lists, remove_none, keys_to_omit=omitted
                    )
                )
            continue

        result[full_key] = value

    return result
108
+
109
+
110
def validate_date_args(date: Optional[str] = None) -> bool:
    """Check that a date string parses with at least one entry of DATE_FORMATS.

    Returns `True` on the first format that parses. Raises `ValueError` when the
    argument is empty/None or when no format matches.

    Supported Date Formats:
        - 'YYYY-MM-DD'
        - 'YYYY-MM-DDTHH:MM:SS'
        - 'YYYY-MM-DD+HH:MM:SS'
        - 'YYYY-MM-DDTHH:MM:SS±HHMM'
    """
    if not date:
        raise ValueError("The argument date is None.")

    for date_format in DATE_FORMATS:
        try:
            datetime.strptime(date, date_format)
        except ValueError:
            # This format did not match; try the next one.
            continue
        return True

    raise ValueError(
        f"The argument {date} does not satisfy the format:"
        f" YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SS±HHMM",
    )
138
+
139
+
140
def get_data_by_suffix(path: Path) -> list[dict]:
    """Load a file with the parser selected by its suffix.

    `.json` and `.ndjson` are parsed from the opened handle; `.csv` and
    `.parquet` are read by pandas from the path and returned as a list of
    row records. Any other suffix raises `ValueError`.
    """
    suffix = path.suffix
    # The file is opened for every suffix, so a path that cannot be opened fails
    # here before the suffix is even considered.
    with path.open() as handle:
        if suffix == ".json":
            return json.load(handle)
        if suffix == ".ndjson":
            return ndjson.load(handle)
        if suffix == ".csv":
            return pd.read_csv(path).to_dict(orient="records")
        if suffix == ".parquet":
            return pd.read_parquet(path).to_dict(orient="records")
        raise ValueError(f"Unsupported file type: {path}")
154
+
155
+
156
def get_data(path: Path) -> list[dict]:
    """Load a data file, first by suffix and then by trying each supported format.

    The suffix-based loader is tried first. If it fails for any reason, the
    file is tried as JSON, NDJSON, CSV and Parquet in that order, and the
    first successful parse is returned. Each failure is logged as a warning.

    Raises:
        IOError: if no parser could read the file.
    """
    try:
        return get_data_by_suffix(path=path)
    except Exception as e:
        logger.warning(f"failed to read {path} by extension: {e}")
    # Fall back
    with path.open() as f:
        try:
            return json.load(f)
        except Exception as e:
            logger.warning(f"failed to read {path} as json: {e}")
        try:
            # The failed JSON attempt has already consumed the stream; rewind so
            # the NDJSON parser sees the whole file instead of an empty remainder.
            f.seek(0)
            return ndjson.load(f)
        except Exception as e:
            logger.warning(f"failed to read {path} as ndjson: {e}")
        # The pandas readers take the path and do not use the open handle.
        try:
            df = pd.read_csv(path)
            return df.to_dict(orient="records")
        except Exception as e:
            logger.warning(f"failed to read {path} as csv: {e}")
        try:
            df = pd.read_parquet(path)
            return df.to_dict(orient="records")
        except Exception as e:
            logger.warning(f"failed to read {path} as parquet: {e}")

    raise IOError(f"File could not be parsed: {path}")
183
+
184
+
185
def get_data_df(path: Path) -> pd.DataFrame:
    """Load a file into a DataFrame with the parser selected by its suffix.

    `.json` and `.ndjson` are parsed from the opened handle and wrapped in a
    DataFrame; `.csv` and `.parquet` are read by pandas from the path. Any
    other suffix raises `ValueError`.
    """
    suffix = path.suffix
    # The file is opened for every suffix, so a path that cannot be opened fails
    # here before the suffix is even considered.
    with path.open() as handle:
        if suffix == ".json":
            return pd.DataFrame(data=json.load(handle))
        if suffix == ".ndjson":
            return pd.DataFrame(data=ndjson.load(handle))
        if suffix == ".csv":
            return pd.read_csv(path)
        if suffix == ".parquet":
            return pd.read_parquet(path)
        raise ValueError(f"Unsupported file type: {path}")