unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,332 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import random
5
+ import shutil
6
+ import tempfile
7
+ from contextlib import contextmanager
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
11
+ from uuid import NAMESPACE_DNS, uuid5
12
+
13
+ from pydantic import BaseModel, Field, Secret
14
+
15
+ from unstructured_ingest.v2.interfaces import (
16
+ AccessConfig,
17
+ ConnectionConfig,
18
+ Downloader,
19
+ DownloaderConfig,
20
+ DownloadResponse,
21
+ FileData,
22
+ FileDataSourceMetadata,
23
+ Indexer,
24
+ IndexerConfig,
25
+ SourceIdentifiers,
26
+ Uploader,
27
+ UploaderConfig,
28
+ )
29
+ from unstructured_ingest.v2.logger import logger
30
+ from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict
31
+
32
+ if TYPE_CHECKING:
33
+ from fsspec import AbstractFileSystem
34
+
35
+ CONNECTOR_TYPE = "fsspec"
36
+
37
+
38
class FileConfig(BaseModel):
    # Shared remote-location settings. `protocol` and `path_without_protocol`
    # are derived from `remote_url` inside `__init__`, not supplied by callers.
    remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
    protocol: str = Field(init=False)
    path_without_protocol: str = Field(init=False)
    supported_protocols: list[str] = Field(
        init=False,
        default_factory=lambda: [
            "s3",
            "s3a",
            "abfs",
            "az",
            "gs",
            "gcs",
            "box",
            "dropbox",
            "sftp",
        ],
    )

    def __init__(self, **data):
        """Split `remote_url` into protocol and path before model validation.

        Raises:
            KeyError: if `remote_url` is not supplied.
            ValueError: if `remote_url` does not contain a `://` separator.
        """
        remote_url = data["remote_url"]
        # Split on the first separator only, so a path that itself contains
        # "://" no longer fails tuple unpacking.
        protocol, separator, path_without_protocol = remote_url.partition("://")
        if not separator:
            raise ValueError(
                f"remote_url must be formatted as `protocol://dir/path`, got: {remote_url!r}"
            )
        data["protocol"] = protocol
        data["path_without_protocol"] = path_without_protocol
        super().__init__(**data)
62
+
63
+
64
class FsspecIndexerConfig(FileConfig, IndexerConfig):
    # When True, FsspecIndexer.get_file_data walks nested directories with
    # `find`; otherwise only the top level is listed with `ls`.
    recursive: bool = False
    # Optional cap on the number of discovered files to keep; when set,
    # FsspecIndexer.sample_n_files picks that many at random.
    sample_n_files: Optional[int] = None
67
+
68
+
69
# Base access config for fsspec connectors; adds no fields of its own here.
class FsspecAccessConfig(AccessConfig):
    ...
71
+
72
+
73
class FsspecConnectionConfig(ConnectionConfig):
    # Connection settings shared by the fsspec-backed connectors.
    access_config: Secret[FsspecAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @contextmanager
    def get_client(self, protocol: str) -> Generator["AbstractFileSystem", None, None]:
        """Yield a filesystem instance for `protocol`, built from the access config."""
        # Imported lazily so fsspec is only required once a client is requested.
        from fsspec import get_filesystem_class

        filesystem_cls = get_filesystem_class(protocol)
        access_kwargs = self.get_access_config()
        yield filesystem_cls(**access_kwargs)

    def wrap_error(self, e: Exception) -> Exception:
        """Return `e` unchanged; this base implementation does no translation."""
        return e
88
+
89
+
90
+ FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
91
+ FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnectionConfig)
92
+
93
+
94
@dataclass
class FsspecIndexer(Indexer):
    """List files on an fsspec filesystem and emit one `FileData` per file.

    `get_metadata` raises `NotImplementedError` here, so concrete connectors
    must override it.
    """

    connection_config: FsspecConnectionConfigT
    index_config: FsspecIndexerConfigT
    # Plain default: the stdlib `@dataclass` decorator does not interpret a
    # pydantic `Field(...)`, so using one here would make the FieldInfo object
    # itself the attribute value instead of the connector name.
    connector_type: str = CONNECTOR_TYPE

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the connection config."""
        return self.connection_config.wrap_error(e=e)

    def precheck(self) -> None:
        """Validate the connection by listing the root path and probing one file.

        Raises:
            Exception: whatever `wrap_error` produces for the underlying failure.
        """
        from fsspec import get_filesystem_class

        try:
            fs = get_filesystem_class(self.index_config.protocol)(
                **self.connection_config.get_access_config(),
            )
            files = fs.ls(path=self.index_config.path_without_protocol, detail=True)
            valid_files = [x.get("name") for x in files if x.get("type") == "file"]
            if not valid_files:
                # Nothing to probe; a successful listing is treated as sufficient.
                return
            file_to_sample = valid_files[0]
            logger.debug(f"attempting to make HEAD request for file: {file_to_sample}")
            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                client.head(path=file_to_sample)
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise self.wrap_error(e=e)

    def get_file_data(self) -> list[dict[str, Any]]:
        """Return the listing entries for non-empty files under the configured path.

        When `index_config.sample_n_files` is set, the result is reduced with
        `sample_n_files`.
        """
        if not self.index_config.recursive:
            # fs.ls does not walk directories
            # directories that are listed in cloud storage can cause problems
            # because they are seen as 0 byte files
            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                files = client.ls(self.index_config.path_without_protocol, detail=True)

        else:
            # fs.find will recursively walk directories
            # "size" is a common key for all the cloud protocols with fs
            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                found = client.find(
                    self.index_config.path_without_protocol,
                    detail=True,
                )
                files = found.values()
        filtered_files = [
            file for file in files if file.get("size") > 0 and file.get("type") == "file"
        ]

        if self.index_config.sample_n_files:
            filtered_files = self.sample_n_files(filtered_files, self.index_config.sample_n_files)

        return filtered_files

    def sample_n_files(self, files: list[dict[str, Any]], n) -> list[dict[str, Any]]:
        """Return `n` randomly chosen entries, or all of them if there are at most `n`."""
        if len(files) <= n:
            logger.warning(
                f"number of files to be sampled={n} is not smaller than the number"
                f" of files found ({len(files)}). Returning all of the files as the"
                " sample."
            )
            return files

        return random.sample(files, n)

    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        """Build source metadata for one listing entry; must be overridden."""
        raise NotImplementedError()

    def get_path(self, file_data: dict) -> str:
        """Return the remote path of a listing entry (its `name` key)."""
        return file_data["name"]

    def sterilize_info(self, file_data: dict) -> dict:
        """Pass the listing entry through `sterilize_dict` and return the result."""
        return sterilize_dict(data=file_data)

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a `FileData` record for every file returned by `get_file_data`."""
        root_path = self.index_config.path_without_protocol
        for file_data in self.get_file_data():
            file_path = self.get_path(file_data=file_data)
            # Drop only the first occurrence of the root path: removing every
            # occurrence would corrupt the relative path when the root string
            # appears again deeper in the tree.
            # Note: we remove any remaining leading slashes (Box introduces these)
            # to get a valid relative path
            rel_path = file_path.replace(root_path, "", 1).lstrip("/")

            additional_metadata = self.sterilize_info(file_data=file_data)
            # The downloader reads this key to locate the remote object.
            additional_metadata["original_file_path"] = file_path
            yield FileData(
                identifier=str(uuid5(NAMESPACE_DNS, file_path)),
                connector_type=self.connector_type,
                source_identifiers=SourceIdentifiers(
                    filename=Path(file_path).name,
                    rel_path=rel_path or None,
                    fullpath=file_path,
                ),
                metadata=self.get_metadata(file_data=file_data),
                additional_metadata=additional_metadata,
                display_name=file_path,
            )
190
+
191
+
192
# Download settings for fsspec connectors; adds nothing to DownloaderConfig here.
class FsspecDownloaderConfig(DownloaderConfig):
    ...
194
+
195
+
196
+ FsspecDownloaderConfigT = TypeVar("FsspecDownloaderConfigT", bound=FsspecDownloaderConfig)
197
+
198
+
199
@dataclass
class FsspecDownloader(Downloader):
    """Download files previously indexed from an fsspec filesystem."""

    protocol: str
    connection_config: FsspecConnectionConfigT
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[FsspecDownloaderConfigT] = field(
        default_factory=lambda: FsspecDownloaderConfig()
    )

    def is_async(self) -> bool:
        """Report the `async_impl` flag of the filesystem client."""
        with self.connection_config.get_client(protocol=self.protocol) as fs_client:
            return fs_client.async_impl

    def handle_directory_download(self, lpath: Path) -> None:
        """Flatten a download that landed as a directory into a single file.

        If the object's name contains certain characters (i.e. '?'), it gets
        downloaded into a new directory of the same name. This replaces that
        directory with the one file it contains, at the same path.

        Raises:
            ValueError: if the directory holds no files, or more than one.
        """
        if not lpath.is_dir():
            return

        target_name = lpath.name
        candidates = [entry for entry in lpath.iterdir() if entry.is_file()]
        if not candidates:
            raise ValueError(f"no files in {lpath}")
        if len(candidates) > 1:
            listing = ", ".join([str(entry) for entry in candidates])
            raise ValueError("Multiple files in {}: {}".format(lpath, listing))

        only_file = candidates[0]
        with tempfile.TemporaryDirectory() as scratch_dir:
            # Stage the file outside `lpath` so the directory can be removed
            # before the file is moved into its place.
            staged_copy = os.path.join(scratch_dir, target_name)
            shutil.copyfile(src=only_file, dst=staged_copy)
            shutil.rmtree(lpath)
            shutil.move(src=staged_copy, dst=lpath)

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the connection config."""
        return self.connection_config.wrap_error(e=e)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Fetch the remote file described by `file_data` to its local download path."""
        local_target = self.get_download_path(file_data=file_data)
        local_target.parent.mkdir(parents=True, exist_ok=True)
        try:
            remote_source = file_data.additional_metadata["original_file_path"]
            with self.connection_config.get_client(protocol=self.protocol) as fs_client:
                fs_client.get(rpath=remote_source, lpath=local_target.as_posix())
            self.handle_directory_download(lpath=local_target)
        except Exception as e:
            raise self.wrap_error(e=e)
        return self.generate_download_response(file_data=file_data, download_path=local_target)

    async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Coroutine counterpart of `run`; awaits the client's `get` call."""
        local_target = self.get_download_path(file_data=file_data)
        local_target.parent.mkdir(parents=True, exist_ok=True)
        try:
            remote_source = file_data.additional_metadata["original_file_path"]
            with self.connection_config.get_client(protocol=self.protocol) as fs_client:
                # NOTE(review): confirm `get` returns an awaitable for the async
                # filesystems this is used with.
                await fs_client.get(rpath=remote_source, lpath=local_target.as_posix())
            self.handle_directory_download(lpath=local_target)
        except Exception as e:
            raise self.wrap_error(e=e)
        return self.generate_download_response(file_data=file_data, download_path=local_target)
260
+
261
+
262
# Upload settings for fsspec connectors: the FileConfig remote-URL fields plus
# whatever UploaderConfig provides; no extra fields are added here.
class FsspecUploaderConfig(FileConfig, UploaderConfig):
    ...
264
+
265
+
266
+ FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
267
+
268
+
269
@dataclass
class FsspecUploader(Uploader):
    """Upload local files to the remote location described by `upload_config`."""

    connector_type: str = CONNECTOR_TYPE
    # Defaults to None only to satisfy dataclass field ordering; `__post_init__`
    # rejects a missing value.
    upload_config: FsspecUploaderConfigT = field(default=None)
    connection_config: FsspecConnectionConfigT

    def is_async(self) -> bool:
        """Report the `async_impl` flag of the filesystem client."""
        protocol = self.upload_config.protocol
        with self.connection_config.get_client(protocol=protocol) as fs_client:
            return fs_client.async_impl

    @property
    def fs(self) -> "AbstractFileSystem":
        """Build a new filesystem instance for the upload protocol."""
        from fsspec import get_filesystem_class

        if self.connection_config:
            fs_kwargs = self.connection_config.get_access_config()
        else:
            fs_kwargs = {}
        filesystem_cls = get_filesystem_class(self.upload_config.protocol)
        return filesystem_cls(**fs_kwargs)

    def __post_init__(self):
        # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
        if self.upload_config:
            return
        raise TypeError(
            f"{self.__class__.__name__}.__init__() "
            f"missing 1 required positional argument: 'upload_config'"
        )

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the connection config."""
        return self.connection_config.wrap_error(e=e)

    def precheck(self) -> None:
        """Validate write access by writing an empty `_empty` object at the upload root."""
        from fsspec import get_filesystem_class

        try:
            filesystem_cls = get_filesystem_class(self.upload_config.protocol)
            fs = filesystem_cls(**self.connection_config.get_access_config())
            probe_path = Path(self.upload_config.path_without_protocol) / "_empty"
            fs.write_bytes(path=probe_path.as_posix(), value=b"")
        except Exception as e:
            raise self.wrap_error(e=e)

    def get_upload_path(self, file_data: FileData) -> Path:
        """Return the remote destination: upload root + relative path + `.json`."""
        remote_root = Path(self.upload_config.path_without_protocol)
        raw_target = remote_root / file_data.source_identifiers.relative_path
        return raw_target.parent / f"{raw_target.name}.json"

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Upload the local file at `path` to its computed remote location."""
        path_str = str(path.resolve())
        upload_path = self.get_upload_path(file_data=file_data)
        logger.debug(f"writing local file {path_str} to {upload_path}")
        protocol = self.upload_config.protocol
        with self.connection_config.get_client(protocol=protocol) as fs_client:
            fs_client.upload(lpath=path_str, rpath=upload_path.as_posix())

    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Coroutine entry point; performs the same upload call as `run`."""
        upload_path = self.get_upload_path(file_data=file_data)
        path_str = str(path.resolve())
        # Odd that fsspec doesn't run exists() as async even when client support async
        logger.debug(f"writing local file {path_str} to {upload_path}")
        protocol = self.upload_config.protocol
        with self.connection_config.get_client(protocol=protocol) as fs_client:
            fs_client.upload(lpath=path_str, rpath=upload_path.as_posix())
@@ -0,0 +1,197 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from time import time
7
+ from typing import TYPE_CHECKING, Any, Generator, Optional, Union
8
+
9
+ from dateutil import parser
10
+ from pydantic import Field, Secret
11
+
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.utils.string_and_date_utils import json_to_dict
14
+ from unstructured_ingest.v2.errors import ProviderError, UserError
15
+ from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
16
+ from unstructured_ingest.v2.logger import logger
17
+ from unstructured_ingest.v2.processes.connector_registry import (
18
+ DestinationRegistryEntry,
19
+ SourceRegistryEntry,
20
+ )
21
+ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
22
+ FsspecAccessConfig,
23
+ FsspecConnectionConfig,
24
+ FsspecDownloader,
25
+ FsspecDownloaderConfig,
26
+ FsspecIndexer,
27
+ FsspecIndexerConfig,
28
+ FsspecUploader,
29
+ FsspecUploaderConfig,
30
+ )
31
+
32
+ if TYPE_CHECKING:
33
+ from gcsfs import GCSFileSystem
34
+
35
+ CONNECTOR_TYPE = "gcs"
36
+
37
+
38
# GCS indexer settings; identical to the generic fsspec indexer config.
class GcsIndexerConfig(FsspecIndexerConfig):
    ...
40
+
41
+
42
+ service_account_key_description = """
43
+ Options:
44
+ - ``None``, GCSFS will attempt to guess your credentials in the
45
+ following order: gcloud CLI default, gcsfs cached token, google compute
46
+ metadata service, anonymous.
47
+ - ``'google_default'``, your default gcloud credentials will be used,
48
+ which are typically established by doing ``gcloud login`` in a terminal.
49
+ - ``'cache'``, credentials from previously successful gcsfs
50
+ authentication will be used (use this after "browser" auth succeeded)
51
+ - ``'anon'``, no authentication is performed, and you can only
52
+ access data which is accessible to allUsers (in this case, the project and
53
+ access level parameters are meaningless)
54
+ - ``'browser'``, you get an access code with which you can
55
+ authenticate via a specially provided URL
56
+ - if ``'cloud'``, we assume we are running within google compute
57
+ or google container engine, and query the internal metadata directly for
58
+ a token.
59
+ - you may supply a token generated by the
60
+ [gcloud](https://cloud.google.com/sdk/docs/)
61
+ utility; this is either a python dictionary or the name of a file
62
+ containing the JSON returned by logging in with the gcloud CLI tool.
63
+ """
64
+
65
+
66
class GcsAccessConfig(FsspecAccessConfig):
    # `service_account_key` is the user-facing input; `token` is derived from it
    # in `model_post_init`.
    service_account_key: Optional[str] = Field(
        default=None, description=service_account_key_description
    )
    token: Union[str, dict, None] = Field(init=False, default=None)

    def model_post_init(self, __context: Any) -> None:
        """Derive `token` from `service_account_key` after model validation.

        Accepted forms, checked in order: empty/None (token stays None), one of
        the named auth modes, a JSON document that parses to a dict, or a path
        to an existing file.

        Raises:
            ValueError: if the key matches none of the accepted forms.
        """
        ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"

        # Case: null value
        if not self.service_account_key:
            return

        # Case: one of auth constants
        if self.service_account_key in ALLOWED_AUTH_VALUES:
            self.token = self.service_account_key
            return

        # Case: token as json (parsed once and reused)
        parsed_key = json_to_dict(self.service_account_key)
        if isinstance(parsed_key, dict):
            self.token = parsed_key
            return

        # Case: path to token
        try:
            is_key_file = Path(self.service_account_key).is_file()
        except (OSError, ValueError):
            # A long or malformed token string is not a usable path; report it
            # as an invalid token below rather than leaking a filesystem error.
            is_key_file = False
        if is_key_file:
            self.token = self.service_account_key
            return

        raise ValueError("Invalid auth token value")
95
+
96
+
97
class GcsConnectionConfig(FsspecConnectionConfig):
    # pydantic `Field` (not `dataclasses.field`) is used here so this model
    # declares its fields the same way as the other config models.
    supported_protocols: list[str] = Field(default_factory=lambda: ["gs", "gcs"], init=False)
    access_config: Secret[GcsAccessConfig] = Field(default=GcsAccessConfig(), validate_default=True)
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["GCSFileSystem", None, None]:
        """Yield the filesystem client built by the parent class for `protocol`."""
        with super().get_client(protocol=protocol) as client:
            yield client

    def wrap_error(self, e: Exception) -> Exception:
        """Translate a GCS failure into a `UserError` or `ProviderError`.

        Recognised failures are raised directly, chained to the original
        exception; anything unrecognised is logged and returned unchanged.

        Raises:
            UserError: for missing files, "Forbidden" OS errors, "Bad Request"
                value errors and HTTP 4xx responses.
            ProviderError: for HTTP responses with a code of 500 or above.
        """
        # https://github.com/fsspec/gcsfs/blob/main/gcsfs/retry.py#L79
        from gcsfs.retry import HttpError

        if isinstance(e, FileNotFoundError):
            raise UserError(f"File not found: {e}") from e
        if isinstance(e, OSError) and "Forbidden" in str(e):
            raise UserError(e) from e
        if isinstance(e, ValueError) and "Bad Request" in str(e):
            raise UserError(e) from e
        if isinstance(e, HttpError) and (http_error_code := e.code):
            message = e.message or e
            if 400 <= http_error_code < 500:
                raise UserError(message) from e
            if http_error_code >= 500:
                raise ProviderError(message) from e
        logger.error(f"unhandled exception from gcs ({type(e)}): {e}", exc_info=True)
        return e
126
+
127
+
128
@dataclass
class GcsIndexer(FsspecIndexer):
    """fsspec indexer specialised for GCS listing entries."""

    connection_config: GcsConnectionConfig
    index_config: GcsIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        """Build source metadata from one GCS listing entry.

        Reads the `name`, `updated`, `timeCreated`, `size`, `etag` and `id`
        keys; every key except `name` is optional.
        """

        def to_epoch_str(raw_timestamp: Optional[str]) -> Optional[str]:
            # Missing or empty timestamps stay None; others become epoch seconds as text.
            if not raw_timestamp:
                return None
            return str(parser.parse(raw_timestamp).timestamp())

        path = file_data["name"]
        date_modified = to_epoch_str(file_data.get("updated"))
        date_created = to_epoch_str(file_data.get("timeCreated"))

        index_config = self.index_config
        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=file_data.get("etag"),
            url=f"{index_config.protocol}://{path}",
            record_locator={
                "protocol": index_config.protocol,
                "remote_file_path": index_config.remote_url,
                "file_id": file_data.get("id"),
            },
            filesize_bytes=file_data.get("size"),
        )
160
+
161
+
162
# GCS download settings; identical to the generic fsspec downloader config.
class GcsDownloaderConfig(FsspecDownloaderConfig):
    ...
164
+
165
+
166
@dataclass
class GcsDownloader(FsspecDownloader):
    """fsspec downloader with GCS defaults for protocol and config types."""

    # Field order is significant for the generated dataclass __init__; keep it.
    protocol: str = "gcs"
    connection_config: GcsConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[GcsDownloaderConfig] = field(default_factory=GcsDownloaderConfig)
172
+
173
+
174
# GCS upload settings; identical to the generic fsspec uploader config.
class GcsUploaderConfig(FsspecUploaderConfig):
    ...
176
+
177
+
178
@dataclass
class GcsUploader(FsspecUploader):
    """fsspec uploader bound to the GCS connection and upload config types."""

    # Field order is significant for the generated dataclass __init__; keep it.
    connector_type: str = CONNECTOR_TYPE
    connection_config: GcsConnectionConfig
    upload_config: GcsUploaderConfig = field(default=None)
183
+
184
+
185
# Registry entry bundling the GCS classes used when GCS is a source.
gcs_source_entry = SourceRegistryEntry(
    indexer=GcsIndexer,
    indexer_config=GcsIndexerConfig,
    downloader=GcsDownloader,
    downloader_config=GcsDownloaderConfig,
    connection_config=GcsConnectionConfig,
)

# Registry entry bundling the GCS classes used when GCS is a destination.
gcs_destination_entry = DestinationRegistryEntry(
    uploader=GcsUploader,
    uploader_config=GcsUploaderConfig,
    connection_config=GcsConnectionConfig,
)