unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,44 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.v2.interfaces.connector import AccessConfig
6
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
7
+ from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
8
+ LanceDBRemoteConnectionConfig,
9
+ LanceDBUploader,
10
+ LanceDBUploaderConfig,
11
+ LanceDBUploadStager,
12
+ LanceDBUploadStagerConfig,
13
+ )
14
+
15
+ CONNECTOR_TYPE = "lancedb_gcs"
16
+
17
+
18
class LanceDBGCSAccessConfig(AccessConfig):
    """Sensitive settings for a LanceDB database accessed through Google Cloud Storage."""

    google_service_account_key: str = Field(
        description="The serialized google service account key.",
    )
22
+
23
+
24
class LanceDBGCSConnectionConfig(LanceDBRemoteConnectionConfig):
    """Remote LanceDB connection settings carrying a GCS access config."""

    access_config: Secret[LanceDBGCSAccessConfig]

    def get_storage_options(self) -> dict:
        """Return the dumped access config fields with the request timeout added."""
        storage_options = dict(self.access_config.get_secret_value().model_dump())
        storage_options["timeout"] = self.timeout
        return storage_options
29
+
30
+
31
@dataclass
class LanceDBGSPUploader(LanceDBUploader):
    """LanceDB uploader bound to the GCS connection config and connector type."""

    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBGCSConnectionConfig
    connector_type: str = CONNECTOR_TYPE
36
+
37
+
38
# Destination registry entry tying together the GCS-flavoured LanceDB components.
lancedb_gcp_destination_entry = DestinationRegistryEntry(
    connection_config=LanceDBGCSConnectionConfig,
    upload_stager=LanceDBUploadStager,
    upload_stager_config=LanceDBUploadStagerConfig,
    uploader=LanceDBGSPUploader,
    uploader_config=LanceDBUploaderConfig,
)
@@ -0,0 +1,169 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ from abc import ABC, abstractmethod
6
+ from contextlib import asynccontextmanager
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
10
+
11
+ import pandas as pd
12
+ from pydantic import Field
13
+
14
+ from unstructured_ingest.error import DestinationConnectionError
15
+ from unstructured_ingest.logger import logger
16
+ from unstructured_ingest.utils.data_prep import flatten_dict
17
+ from unstructured_ingest.utils.dep_check import requires_dependencies
18
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
19
+ from unstructured_ingest.v2.interfaces.connector import ConnectionConfig
20
+ from unstructured_ingest.v2.interfaces.file_data import FileData
21
+ from unstructured_ingest.v2.interfaces.upload_stager import UploadStager, UploadStagerConfig
22
+ from unstructured_ingest.v2.interfaces.uploader import Uploader, UploaderConfig
23
+
24
+ CONNECTOR_TYPE = "lancedb"
25
+
26
+ if TYPE_CHECKING:
27
+ from lancedb import AsyncConnection
28
+ from lancedb.table import AsyncTable
29
+
30
+
31
class LanceDBConnectionConfig(ConnectionConfig, ABC):
    """Abstract connection settings shared by the LanceDB destination variants."""

    uri: str = Field(description="The uri of the database.")

    @abstractmethod
    def get_storage_options(self) -> Optional[dict[str, str]]:
        """Return the storage options passed to ``lancedb.connect_async``."""
        raise NotImplementedError

    @asynccontextmanager
    @requires_dependencies(["lancedb"], extras="lancedb")
    @DestinationConnectionError.wrap
    async def get_async_connection(self) -> AsyncGenerator["AsyncConnection", None]:
        """Open an async LanceDB connection for ``uri`` and yield it to the caller."""
        import lancedb

        storage_options = self.get_storage_options()
        opened = await lancedb.connect_async(self.uri, storage_options=storage_options)
        # The awaited connection object is used as a (synchronous) context manager.
        with opened as connection:
            yield connection
49
+
50
+
51
class LanceDBRemoteConnectionConfig(LanceDBConnectionConfig):
    """Connection settings for LanceDB variants that add a request timeout."""

    # NOTE(review): the pattern below is not anchored; presumably pydantic applies it
    # as a search, so values with extra surrounding characters may still validate.
    # Confirm before tightening, since that would reject currently accepted input.
    timeout: str = Field(
        default="30s",
        description=(
            # Trailing space keeps the two adjacent literals from fusing into "finishedin".
            "Timeout for the entire request, from connection until the response body has finished "
            "in a [0-9]+(ns|us|ms|[smhdwy]) format."
        ),
        pattern=r"[0-9]+(ns|us|ms|[smhdwy])",
    )
60
+
61
+
62
class LanceDBUploadStagerConfig(UploadStagerConfig):
    """Stager configuration for LanceDB; it declares no fields of its own."""
64
+
65
+
66
@dataclass
class LanceDBUploadStager(UploadStager):
    """Turn a JSON elements file into a Feather file of flattened rows."""

    upload_stager_config: LanceDBUploadStagerConfig = field(
        default_factory=LanceDBUploadStagerConfig
    )

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Load the elements JSON, conform each element and write a ``.feather`` file.

        Returns the path of the written file: ``output_dir / output_filename`` with its
        suffix replaced by ``.feather``.
        """
        with open(elements_filepath) as elements_file:
            elements_contents: list[dict] = json.load(elements_file)

        rows = []
        for element in elements_contents:
            rows.append(self.conform_dict(element_dict=element, file_data=file_data))
        frame = pd.DataFrame(rows)

        staged_path = (output_dir / output_filename).with_suffix(".feather")
        frame.to_feather(staged_path)
        return staged_path

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Build one row: the embeddings as ``vector``, the record id, then flattened fields."""
        remaining = element_dict.copy()
        embeddings = remaining.pop("embeddings", None)
        row = {"vector": embeddings, RECORD_ID_LABEL: file_data.identifier}
        # Flattened keys are merged last, so they win on a name clash.
        row.update(flatten_dict(remaining, separator="-"))
        return row
102
+
103
+
104
class LanceDBUploaderConfig(UploaderConfig):
    """Uploader configuration naming the LanceDB table to write to."""

    table_name: str = Field(description="The name of the table.")
106
+
107
+
108
@dataclass
class LanceDBUploader(Uploader):
    """Upload staged Feather files into an existing LanceDB table."""

    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @DestinationConnectionError.wrap
    def precheck(self):
        """Connect and open the configured table once, then close it."""

        async def _open_and_close_table() -> None:
            async with self.connection_config.get_async_connection() as conn:
                opened = await conn.open_table(self.upload_config.table_name)
                opened.close()

        asyncio.run(_open_and_close_table())

    @asynccontextmanager
    async def get_table(self) -> AsyncGenerator["AsyncTable", None]:
        """Yield the configured table and close it when the caller is done."""
        async with self.connection_config.get_async_connection() as conn:
            table = await conn.open_table(self.upload_config.table_name)
            try:
                yield table
            finally:
                table.close()

    async def run_async(self, path, file_data, **kwargs):
        """Read the staged file, fit it to the table schema and add its rows.

        When the table has a record id column, rows stored for the same record are
        deleted first; otherwise a warning is logged and the rows are appended.
        """
        df = pd.read_feather(path)
        async with self.get_table() as table:
            schema = await table.schema()
            df = self._fit_to_schema(df, schema)
            if RECORD_ID_LABEL in schema.names:
                await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
            else:
                logger.warning(
                    f"Designated table doesn't contain {RECORD_ID_LABEL} column of type"
                    " string which is required to support overwriting updates on subsequent"
                    " uploads of the same record. New rows will be appended instead."
                )
            await table.add(data=df)

    def _fit_to_schema(self, df: pd.DataFrame, schema) -> pd.DataFrame:
        """Drop columns absent from the schema and add the schema columns the frame lacks."""
        frame_columns = set(df.columns)
        table_columns = set(schema.names)
        extra_columns = frame_columns - table_columns
        absent_columns = table_columns - frame_columns

        if extra_columns:
            logger.info(
                "Following columns will be dropped to match the table's schema: "
                f"{', '.join(extra_columns)}"
            )
        if absent_columns:
            logger.info(
                "Following null filled columns will be added to match the table's schema:"
                f" {', '.join(absent_columns)} "
            )

        df = df.drop(columns=extra_columns)

        # Assigning an empty Series leaves every row of the new column without a value.
        for column in absent_columns:
            df[column] = pd.Series()

        return df
@@ -0,0 +1,44 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.v2.interfaces.connector import AccessConfig
6
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
7
+ from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
8
+ LanceDBConnectionConfig,
9
+ LanceDBUploader,
10
+ LanceDBUploaderConfig,
11
+ LanceDBUploadStager,
12
+ LanceDBUploadStagerConfig,
13
+ )
14
+
15
+ CONNECTOR_TYPE = "lancedb_local"
16
+
17
+
18
class LanceDBLocalAccessConfig(AccessConfig):
    """Access config for a local LanceDB database; it declares no fields."""
20
+
21
+
22
class LanceDBLocalConnectionConfig(LanceDBConnectionConfig):
    """Connection settings for a local LanceDB database."""

    access_config: Secret[LanceDBLocalAccessConfig] = Field(
        validate_default=True,
        default_factory=LanceDBLocalAccessConfig,
    )

    def get_storage_options(self) -> None:
        """Return ``None``: this variant supplies no storage options."""
        return None
29
+
30
+
31
@dataclass
class LanceDBLocalUploader(LanceDBUploader):
    """LanceDB uploader bound to the local connection config and connector type."""

    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBLocalConnectionConfig
    connector_type: str = CONNECTOR_TYPE
36
+
37
+
38
# Destination registry entry tying together the local LanceDB components.
lancedb_local_destination_entry = DestinationRegistryEntry(
    connection_config=LanceDBLocalConnectionConfig,
    upload_stager=LanceDBUploadStager,
    upload_stager_config=LanceDBUploadStagerConfig,
    uploader=LanceDBLocalUploader,
    uploader_config=LanceDBUploaderConfig,
)
@@ -0,0 +1,217 @@
1
+ import glob
2
+ import json
3
+ import shutil
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from time import time
7
+ from typing import Any, Generator
8
+
9
+ from pydantic import Field, Secret
10
+
11
+ from unstructured_ingest.v2.interfaces import (
12
+ AccessConfig,
13
+ ConnectionConfig,
14
+ Downloader,
15
+ DownloaderConfig,
16
+ DownloadResponse,
17
+ FileData,
18
+ FileDataSourceMetadata,
19
+ Indexer,
20
+ IndexerConfig,
21
+ SourceIdentifiers,
22
+ Uploader,
23
+ UploaderConfig,
24
+ )
25
+ from unstructured_ingest.v2.logger import logger
26
+ from unstructured_ingest.v2.processes.connector_registry import (
27
+ DestinationRegistryEntry,
28
+ SourceRegistryEntry,
29
+ )
30
+
31
+ CONNECTOR_TYPE = "local"
32
+
33
+
34
class LocalAccessConfig(AccessConfig):
    """Access config for the local filesystem connector; it declares no fields."""
36
+
37
+
38
class LocalConnectionConfig(ConnectionConfig):
    """Connection settings for the local filesystem connector."""

    access_config: Secret[LocalAccessConfig] = Field(
        validate_default=True,
        default=LocalAccessConfig(),
    )
42
+
43
+
44
class LocalIndexerConfig(IndexerConfig):
    """Indexer settings: which local path to process and whether to recurse."""

    input_path: Path = Field(
        description="Path to the location in the local file system that will be processed.",
    )
    recursive: bool = Field(
        default=False,
        description="Recursively download files in their respective folders "
        "otherwise stop at the files in provided folder level.",
    )

    @property
    def path(self) -> Path:
        """The configured input path, resolved to an absolute path."""
        resolved = Path(self.input_path).resolve()
        return resolved
57
+
58
+
59
@dataclass
class LocalIndexer(Indexer):
    """Enumerate files under a local path and describe each one as a ``FileData``."""

    index_config: LocalIndexerConfig
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    def list_files(self) -> list[Path]:
        """Return the files to index.

        A file input yields just that file. A directory input yields its regular
        files, descending into sub-directories only when ``recursive`` is set.
        """
        input_path = self.index_config.path
        if input_path.is_file():
            # Return the path itself rather than globbing it: glob would interpret
            # characters such as "[", "]", "*" or "?" in the file name as a pattern
            # and could fail to match the very file that was asked for.
            return [input_path]
        candidates = (
            input_path.rglob("*") if self.index_config.recursive else input_path.glob("*")
        )
        return [candidate for candidate in candidates if candidate.is_file()]

    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
        """Collect timestamps, mode and size of ``path`` from a single ``stat`` call.

        Raises whatever ``Path.stat`` raises (for instance when the file vanished).
        """
        stats = path.stat()

        # st_birthtime is not provided on every platform, so this lookup stays
        # best-effort; the other stat fields used below are always present.
        try:
            date_created = str(stats.st_birthtime)
        except AttributeError as e:
            logger.warning(f"Couldn't detect date created: {e}")
            date_created = None

        return FileDataSourceMetadata(
            date_modified=str(stats.st_mtime),
            date_created=date_created,
            date_processed=str(time()),
            permissions_data=[{"mode": stats.st_mode}],
            record_locator={"path": str(path.resolve())},
            filesize_bytes=stats.st_size,
        )

    @staticmethod
    def _relative_to_root(file_path: Path, resolved_path: Path, root: Path) -> str:
        """Return the location of a listed file relative to the indexed directory."""
        try:
            return str(resolved_path.relative_to(root))
        except ValueError:
            # The resolved path left the root (e.g. through a symlink); the path as
            # listed was produced by globbing the root, so it is always beneath it.
            return str(file_path.relative_to(root))

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per listed file.

        ``rel_path`` is the file name when the input is a single file, otherwise the
        path relative to the input directory. It is computed structurally instead of
        by string replacement, which stripped every occurrence of the root string
        and so corrupted paths that repeat it further down.
        """
        root = self.index_config.path
        root_is_file = root.is_file()
        for file_path in self.list_files():
            resolved_path = file_path.resolve()
            if root_is_file:
                rel_path = root.name
            else:
                rel_path = self._relative_to_root(file_path, resolved_path, root)
            yield FileData(
                identifier=str(resolved_path),
                connector_type=CONNECTOR_TYPE,
                source_identifiers=SourceIdentifiers(
                    fullpath=str(resolved_path),
                    filename=file_path.name,
                    rel_path=rel_path,
                ),
                metadata=self.get_file_metadata(path=file_path),
            )
133
+
134
+
135
class LocalDownloaderConfig(DownloaderConfig):
    """Downloader configuration for the local connector; it declares no fields."""
137
+
138
+
139
@dataclass
class LocalDownloader(Downloader):
    """Downloader for local files: it reports the source path without copying anything."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: LocalConnectionConfig = field(default_factory=LocalConnectionConfig)
    download_config: LocalDownloaderConfig = field(default_factory=LocalDownloaderConfig)

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the file's full source path as a ``Path``."""
        source_path = file_data.source_identifiers.fullpath
        return Path(source_path)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Return a response pointing at the file's full source path."""
        local_path = Path(file_data.source_identifiers.fullpath)
        return DownloadResponse(file_data=file_data, path=local_path)
152
+
153
+
154
class LocalUploaderConfig(UploaderConfig):
    """Uploader settings naming the local directory that receives the output."""

    output_dir: str = Field(
        default="structured-output", description="Local path to write partitioned output to"
    )

    @property
    def output_path(self) -> Path:
        """The configured output directory, resolved to an absolute path."""
        return Path(self.output_dir).resolve()

    def _ensure_output_path_is_not_a_file(self) -> None:
        """Raise ``ValueError`` when the output path already exists as a regular file."""
        if self.output_path.exists() and self.output_path.is_file():
            raise ValueError("output path already exists as a file")

    def model_post_init(self, __context: Any) -> None:
        """Validate the output path once the model has been built.

        ``__post_init__`` is a dataclass hook that pydantic models do not invoke, so
        the check is wired through pydantic's own post-init hook to actually run.
        """
        super().model_post_init(__context)
        self._ensure_output_path_is_not_a_file()

    def __post_init__(self):
        """Run the same validation; kept so explicit callers keep working."""
        self._ensure_output_path_is_not_a_file()
166
+
167
+
168
@dataclass
class LocalUploader(Uploader):
    """Write processed output beneath the configured local output directory."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: LocalUploaderConfig = field(default_factory=LocalUploaderConfig)
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )

    def is_async(self) -> bool:
        """This uploader runs synchronously."""
        return False

    def get_destination_path(self, file_data: FileData) -> Path:
        """Return the ``.json`` destination for ``file_data``, creating its parent folder.

        With source identifiers the destination mirrors the relative source path
        under the output directory; otherwise it is named after the identifier.
        """
        output_root = self.upload_config.output_path
        if source_identifiers := file_data.source_identifiers:
            rel_path = source_identifiers.relative_path
            if rel_path.startswith("/"):
                # Drop the leading slash so the join below stays under the output root.
                rel_path = rel_path[1:]
            new_path = output_root / Path(rel_path)
            # Append ".json" to the final component only. Replacing the file name
            # textually in the whole path also rewrote any directory component that
            # happened to contain the same text.
            final_path = new_path.with_name(f"{new_path.name}.json")
        else:
            final_path = output_root / Path(f"{file_data.identifier}.json")
        final_path.parent.mkdir(parents=True, exist_ok=True)
        return final_path

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Serialize ``data`` as JSON into the destination file."""
        final_path = self.get_destination_path(file_data=file_data)
        with final_path.open("w") as f:
            json.dump(data, f)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Copy the file at ``path`` to the destination file."""
        final_path = self.get_destination_path(file_data=file_data)
        logger.debug(f"copying file from {path} to {final_path}")
        shutil.copy(src=str(path), dst=str(final_path))
205
+
206
+
207
# Source registry entry for the local filesystem connector.
local_source_entry = SourceRegistryEntry(
    connection_config=LocalConnectionConfig,
    indexer=LocalIndexer,
    indexer_config=LocalIndexerConfig,
    downloader=LocalDownloader,
    downloader_config=LocalDownloaderConfig,
)
214
+
215
# Destination registry entry for the local filesystem connector.
local_destination_entry = DestinationRegistryEntry(
    uploader_config=LocalUploaderConfig,
    uploader=LocalUploader,
)
@@ -0,0 +1,225 @@
1
+ import json
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from typing import TYPE_CHECKING, Any, Generator, Optional, Union
5
+
6
+ from dateutil import parser
7
+ from pydantic import Field, Secret
8
+
9
+ from unstructured_ingest.error import DestinationConnectionError, WriteError
10
+ from unstructured_ingest.utils.data_prep import flatten_dict
11
+ from unstructured_ingest.utils.dep_check import requires_dependencies
12
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
13
+ from unstructured_ingest.v2.interfaces import (
14
+ AccessConfig,
15
+ ConnectionConfig,
16
+ FileData,
17
+ Uploader,
18
+ UploaderConfig,
19
+ UploadStager,
20
+ UploadStagerConfig,
21
+ )
22
+ from unstructured_ingest.v2.logger import logger
23
+ from unstructured_ingest.v2.processes.connector_registry import (
24
+ DestinationRegistryEntry,
25
+ )
26
+
27
+ if TYPE_CHECKING:
28
+ from pymilvus import MilvusClient
29
+
30
+ CONNECTOR_TYPE = "milvus"
31
+
32
+
33
class MilvusAccessConfig(AccessConfig):
    """Sensitive Milvus settings; both fields are optional."""

    password: Optional[str] = Field(description="Milvus password", default=None)
    token: Optional[str] = Field(description="Milvus access token", default=None)
36
+
37
+
38
class MilvusConnectionConfig(ConnectionConfig):
    """Connection settings used to build a ``MilvusClient``."""

    access_config: Secret[MilvusAccessConfig] = Field(
        default=MilvusAccessConfig(), validate_default=True
    )
    uri: Optional[str] = Field(
        default=None, description="Milvus uri", examples=["http://localhost:19530"]
    )
    user: Optional[str] = Field(default=None, description="Milvus user")
    db_name: Optional[str] = Field(default=None, description="Milvus database name")

    def get_connection_kwargs(self) -> dict[str, Any]:
        """Merge connection and access fields into client kwargs, omitting ``None`` values."""
        secret_fields = self.access_config.get_secret_value().model_dump()
        merged = {
            key: value for key, value in self.model_dump().items() if key != "access_config"
        }
        merged.update(secret_fields)
        # Drop any that were not set explicitly
        return {key: value for key, value in merged.items() if value is not None}

    @requires_dependencies(["pymilvus"], extras="milvus")
    @contextmanager
    def get_client(self) -> Generator["MilvusClient", None, None]:
        """Yield a ``MilvusClient`` and close it afterwards if one was created."""
        from pymilvus import MilvusClient

        # Stays None when the constructor itself raises, so cleanup is skipped.
        client = None
        try:
            client = MilvusClient(**self.get_connection_kwargs())
            yield client
        finally:
            if client:
                client.close()
70
+
71
+
72
class MilvusUploadStagerConfig(UploadStagerConfig):
    # Options controlling how elements are reshaped before upload to Milvus.

    fields_to_include: Optional[list[str]] = None
    """If set - list of fields to include in the output.
    Unspecified fields are removed from the elements.
    This action takes place after metadata flattening.
    Missing fields will cause stager to throw KeyError."""

    flatten_metadata: bool = True
    """If set - flatten "metadata" key and put contents directly into data"""
81
+
82
+
83
@dataclass
class MilvusUploadStager(UploadStager):
    """Reshape element dictionaries into the flat records uploaded to Milvus."""

    upload_stager_config: MilvusUploadStagerConfig = field(
        default_factory=MilvusUploadStagerConfig
    )

    @staticmethod
    def parse_date_string(date_string: str) -> float:
        """Return ``date_string`` as a timestamp.

        A numeric string is returned as a float; anything else is parsed with
        ``dateutil`` and converted with ``.timestamp()``.
        """
        try:
            return float(date_string)
        except ValueError:
            pass
        parsed = parser.parse(date_string)
        return parsed.timestamp()

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Return a copy of ``element_dict`` prepared for insertion.

        Steps, in order: optional metadata flattening, default filling, optional
        field filtering (raising ``KeyError`` for a requested field that is absent),
        date and JSON conversions, and finally tagging with the record id.
        """
        config = self.upload_stager_config
        working_data = element_dict.copy()

        if config.flatten_metadata:
            metadata = working_data.pop("metadata", None)
            if metadata:
                working_data.update(
                    flatten_dict(metadata, keys_to_omit=["data_source_record_locator"])
                )

        # TODO: milvus sdk doesn't seem to support defaults via the schema yet,
        #  remove once that gets updated
        working_data.setdefault("is_continuation", False)

        wanted_fields = config.fields_to_include
        if wanted_fields:
            working_data = {
                key: value for key, value in working_data.items() if key in wanted_fields
            }
            for wanted_field in wanted_fields:
                if wanted_field not in working_data:
                    raise KeyError(f"Field '{wanted_field}' is missing in data!")

        datetime_columns = (
            "data_source_date_created",
            "data_source_date_modified",
            "data_source_date_processed",
            "last_modified",
        )
        for datetime_column in datetime_columns:
            if datetime_column in working_data:
                raw_date = working_data[datetime_column]
                working_data[datetime_column] = self.parse_date_string(raw_date)

        for json_field in ("languages", "data_source_permissions_data"):
            if json_field in working_data:
                working_data[json_field] = json.dumps(working_data[json_field])

        working_data[RECORD_ID_LABEL] = file_data.identifier
        return working_data
140
+
141
+
142
class MilvusUploaderConfig(UploaderConfig):
    """Uploader settings: target database, collection and record id key."""

    db_name: Optional[str] = Field(description="Milvus database name", default=None)
    collection_name: str = Field(description="Milvus collections to write to")
    record_id_key: str = Field(
        description="searchable key to find entries for the same record on previous runs",
        default=RECORD_ID_LABEL,
    )
149
+
150
+
151
@dataclass
class MilvusUploader(Uploader):
    """Replace the records of one source file inside a Milvus collection."""

    connection_config: MilvusConnectionConfig
    upload_config: MilvusUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    @DestinationConnectionError.wrap
    def precheck(self):
        """Check that the configured collection exists.

        Raises ``DestinationConnectionError`` when it is missing or when Milvus
        reports an error while checking.
        """
        from pymilvus import MilvusException

        try:
            with self.get_client() as client:
                if not client.has_collection(self.upload_config.collection_name):
                    raise DestinationConnectionError(
                        f"Collection '{self.upload_config.collection_name}' does not exist"
                    )
        except MilvusException as milvus_exception:
            raise DestinationConnectionError(
                f"failed to precheck Milvus: {str(milvus_exception.message)}"
            ) from milvus_exception

    @contextmanager
    def get_client(self) -> Generator["MilvusClient", None, None]:
        """Yield a client, switched to the uploader's database when one is configured."""
        with self.connection_config.get_client() as client:
            if db_name := self.upload_config.db_name:
                client.using_database(db_name=db_name)
            yield client

    def delete_by_record_id(self, file_data: FileData) -> None:
        """Delete the entries whose record id key equals the file's identifier."""
        record_id_key = self.upload_config.record_id_key
        collection_name = self.upload_config.collection_name
        # Log the key that the filter really uses; it is configurable and may
        # differ from the default record id label.
        logger.info(
            f"deleting any content with metadata {record_id_key}={file_data.identifier} "
            f"from milvus collection {collection_name}"
        )
        with self.get_client() as client:
            delete_filter = f'{record_id_key} == "{file_data.identifier}"'
            resp = client.delete(collection_name=collection_name, filter=delete_filter)
        logger.info(
            "deleted {} records from milvus collection {}".format(
                resp["delete_count"], collection_name
            )
        )

    @requires_dependencies(["pymilvus"], extras="milvus")
    def insert_results(self, data: Union[dict, list[dict]]):
        """Insert ``data`` into the collection.

        Raises ``WriteError`` when Milvus raises or reports a positive ``err_count``.
        """
        from pymilvus import MilvusException

        # Mirror get_client(): an uploader-level database takes precedence over the
        # connection-level one, so report the database that is actually targeted.
        db_name = self.upload_config.db_name or self.connection_config.db_name
        logger.info(
            f"uploading {len(data)} entries to {db_name} "
            f"db in collection {self.upload_config.collection_name}"
        )
        with self.get_client() as client:
            try:
                res = client.insert(collection_name=self.upload_config.collection_name, data=data)
            except MilvusException as milvus_exception:
                raise WriteError(
                    f"failed to upload records to Milvus: {str(milvus_exception.message)}"
                ) from milvus_exception
            if "err_count" in res and isinstance(res["err_count"], int) and res["err_count"] > 0:
                err_count = res["err_count"]
                raise WriteError(f"failed to upload {err_count} docs")

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Delete the file's existing entries, then insert the new ones."""
        self.delete_by_record_id(file_data=file_data)
        self.insert_results(data=data)
217
+
218
+
219
# Destination registry entry tying together the Milvus components.
milvus_destination_entry = DestinationRegistryEntry(
    connection_config=MilvusConnectionConfig,
    upload_stager=MilvusUploadStager,
    upload_stager_config=MilvusUploadStagerConfig,
    uploader=MilvusUploader,
    uploader_config=MilvusUploaderConfig,
)