unstructured-ingest 0.3.13__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,185 @@
1
+ import contextlib
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from time import time
5
+ from typing import TYPE_CHECKING, Any, Generator, Optional
6
+
7
+ from pydantic import Field, Secret
8
+
9
+ from unstructured_ingest.utils.dep_check import requires_dependencies
10
+ from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
11
+ from unstructured_ingest.v2.interfaces import (
12
+ FileDataSourceMetadata,
13
+ )
14
+ from unstructured_ingest.v2.logger import logger
15
+ from unstructured_ingest.v2.processes.connector_registry import (
16
+ DestinationRegistryEntry,
17
+ SourceRegistryEntry,
18
+ )
19
+ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
20
+ FsspecAccessConfig,
21
+ FsspecConnectionConfig,
22
+ FsspecDownloader,
23
+ FsspecDownloaderConfig,
24
+ FsspecIndexer,
25
+ FsspecIndexerConfig,
26
+ FsspecUploader,
27
+ FsspecUploaderConfig,
28
+ )
29
+
30
+ CONNECTOR_TYPE = "s3"
31
+
32
+ if TYPE_CHECKING:
33
+ from s3fs import S3FileSystem
34
+
35
+
36
class S3IndexerConfig(FsspecIndexerConfig):
    """Indexer settings for S3; declares nothing beyond the fsspec base config."""
38
+
39
+
40
class S3AccessConfig(FsspecAccessConfig):
    """Optional S3 credentials; every field defaults to ``None``."""

    # Access key ID.
    key: Optional[str] = Field(
        description=(
            "If not anonymous, use this access key ID, if specified. Takes precedence "
            "over `aws_access_key_id` in client_kwargs."
        ),
        default=None,
    )
    # Secret access key paired with ``key``.
    secret: Optional[str] = Field(
        description="If not anonymous, use this secret access key, if specified.",
        default=None,
    )
    # Security (session) token.
    token: Optional[str] = Field(
        description="If not anonymous, use this security token, if specified.",
        default=None,
    )
52
+
53
+
54
class S3ConnectionConfig(FsspecConnectionConfig):
    """Connection settings for S3 and S3-compatible object stores."""

    # Declared with pydantic's ``Field`` (this is a pydantic model), matching the
    # other fields of this class.
    supported_protocols: list[str] = Field(default_factory=lambda: ["s3", "s3a"], init=False)
    access_config: Secret[S3AccessConfig] = Field(default=S3AccessConfig(), validate_default=True)
    endpoint_url: Optional[str] = Field(
        default=None,
        description="Use this endpoint_url, if specified. Needed for "
        "connecting to non-AWS S3 buckets.",
    )
    anonymous: bool = Field(
        default=False, description="Connect to s3 without local AWS credentials."
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Build the keyword arguments handed to the filesystem constructor.

        Returns:
            A dict that always contains ``anon``, contains ``endpoint_url`` when
            one is configured, and contains every truthy credential field.
        """
        access_configs: dict[str, Any] = {"anon": self.anonymous}
        if self.endpoint_url:
            access_configs["endpoint_url"] = self.endpoint_url

        # Avoid injecting None by filtering out k,v pairs where the value is None
        access_configs.update(
            {k: v for k, v in self.access_config.get_secret_value().model_dump().items() if v}
        )
        return access_configs

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["S3FileSystem", None, None]:
        """Yield the filesystem client produced by the parent class for ``protocol``."""
        with super().get_client(protocol=protocol) as client:
            yield client

    def wrap_error(self, e: Exception) -> Exception:
        """Translate an exception raised by s3fs into an ingest error type.

        Args:
            e: The exception to classify.

        Returns:
            ``UserAuthError`` for ``PermissionError``, ``UserError`` for
            ``FileNotFoundError`` or an HTTP 4xx cause, ``ProviderError`` for an
            HTTP 5xx cause, otherwise ``e`` itself (after logging it).
        """
        # s3fs maps botocore errors into python ones using mapping here:
        # https://github.com/fsspec/s3fs/blob/main/s3fs/errors.py
        if isinstance(e, PermissionError):
            return UserAuthError(e)
        if isinstance(e, FileNotFoundError):
            return UserError(e)

        # Only inspect the chained cause when it carries a dict-shaped
        # ``response``; any other cause must not make this mapper raise.
        cause = getattr(e, "__cause__", None)
        error_response = getattr(cause, "response", None)
        if isinstance(error_response, dict):
            error_meta = error_response.get("ResponseMetadata") or {}
            http_code = error_meta.get("HTTPStatusCode")
            message = (error_response.get("Error") or {}).get("Message", str(e))
            if isinstance(http_code, int):
                if 400 <= http_code < 500:
                    return UserError(message)
                if http_code >= 500:
                    return ProviderError(message)

        logger.error(f"unhandled exception from s3 ({type(e)}): {e}", exc_info=True)
        return e
102
+
103
+
104
@dataclass
class S3Indexer(FsspecIndexer):
    """Indexer that lists S3 objects and builds their source metadata."""

    connection_config: S3ConnectionConfig
    index_config: S3IndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the connection config."""
        return self.connection_config.wrap_error(e=e)

    def get_path(self, file_data: dict) -> str:
        """Return the ``Key`` entry of a listing record."""
        return file_data["Key"]

    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        """Build ``FileDataSourceMetadata`` from one listing record.

        Reads ``Key``, ``LastModified``, ``size``/``Size`` and ``ETag`` from
        ``file_data`` and additionally asks the client for the object's metadata.
        """
        object_key = file_data["Key"]

        # Both dates are derived from ``LastModified``.
        last_modified = file_data.get("LastModified")
        if last_modified:
            date_created = str(last_modified.timestamp())
            date_modified = str(last_modified.timestamp())
        else:
            date_created = None
            date_modified = None

        # Prefer the lowercase key; fall back to ``Size`` when it is missing or falsy.
        size_bytes = file_data.get("size") or file_data.get("Size")

        # Drop the double quotes surrounding the ETag value.
        etag = file_data.get("ETag").strip('"') if "ETag" in file_data else None

        object_metadata: dict[str, str] = {}
        # An AttributeError raised in here is deliberately ignored, leaving the
        # metadata empty.
        with contextlib.suppress(AttributeError):
            with self.connection_config.get_client(protocol=self.index_config.protocol) as fs:
                object_metadata = fs.metadata(path=object_key)

        locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
        }
        if object_metadata:
            locator["metadata"] = object_metadata

        return FileDataSourceMetadata(
            url=f"{self.index_config.protocol}://{object_key}",
            version=etag,
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            filesize_bytes=size_bytes,
            record_locator=locator,
        )
148
+
149
+
150
class S3DownloaderConfig(FsspecDownloaderConfig):
    """Downloader settings for S3; declares nothing beyond the fsspec base config."""
152
+
153
+
154
@dataclass
class S3Downloader(FsspecDownloader):
    """Downloader bound to the ``s3`` protocol."""

    # Field order is kept exactly as declared; dataclass ordering depends on it.
    protocol: str = "s3"
    connection_config: S3ConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[S3DownloaderConfig] = field(
        default_factory=S3DownloaderConfig,
    )
160
+
161
+
162
class S3UploaderConfig(FsspecUploaderConfig):
    """Uploader settings for S3; declares nothing beyond the fsspec base config."""
164
+
165
+
166
@dataclass
class S3Uploader(FsspecUploader):
    """Uploader that writes through an ``S3ConnectionConfig``."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: S3ConnectionConfig
    # Defaults to None although the annotation is not Optional.
    upload_config: S3UploaderConfig = field(
        default=None,
    )
171
+
172
+
173
# Registry entry wiring together the S3 source-side classes.
s3_source_entry = SourceRegistryEntry(
    connection_config=S3ConnectionConfig,
    indexer_config=S3IndexerConfig,
    indexer=S3Indexer,
    downloader_config=S3DownloaderConfig,
    downloader=S3Downloader,
)

# Registry entry wiring together the S3 destination-side classes.
s3_destination_entry = DestinationRegistryEntry(
    connection_config=S3ConnectionConfig,
    uploader_config=S3UploaderConfig,
    uploader=S3Uploader,
)
@@ -0,0 +1,171 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from contextlib import contextmanager
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from time import time
8
+ from typing import TYPE_CHECKING, Any, Generator, Optional
9
+ from urllib.parse import urlparse
10
+
11
+ from pydantic import Field, Secret
12
+
13
+ from unstructured_ingest.utils.dep_check import requires_dependencies
14
+ from unstructured_ingest.v2.interfaces import FileData, FileDataSourceMetadata
15
+ from unstructured_ingest.v2.processes.connector_registry import (
16
+ DestinationRegistryEntry,
17
+ SourceRegistryEntry,
18
+ )
19
+ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
20
+ FsspecAccessConfig,
21
+ FsspecConnectionConfig,
22
+ FsspecDownloader,
23
+ FsspecDownloaderConfig,
24
+ FsspecIndexer,
25
+ FsspecIndexerConfig,
26
+ FsspecUploader,
27
+ FsspecUploaderConfig,
28
+ )
29
+
30
+ if TYPE_CHECKING:
31
+ from fsspec.implementations.sftp import SFTPFileSystem
32
+
33
+ CONNECTOR_TYPE = "sftp"
34
+
35
+
36
class SftpIndexerConfig(FsspecIndexerConfig):
    """Indexer settings for SFTP that derive the directory to list from ``remote_url``."""

    def model_post_init(self, __context: Any) -> None:
        """Set ``path_without_protocol`` from the URL path.

        When ``remote_url`` ends in a file extension, the parent directory of
        the URL path is used; otherwise the URL path itself. Leading slashes
        are removed in both cases.
        """
        super().model_post_init(__context)
        url_path = urlparse(self.remote_url).path
        extension = os.path.splitext(self.remote_url)[1]
        if extension:
            url_path = Path(url_path).parent.as_posix()
        self.path_without_protocol = url_path.lstrip("/")
45
+
46
+
47
class SftpAccessConfig(FsspecAccessConfig):
    """Secret part of the SFTP configuration."""

    # Required: there is no default.
    password: str = Field(
        description="Password for sftp connection",
    )
49
+
50
+
51
class SftpConnectionConfig(FsspecConnectionConfig):
    """Connection settings for an SFTP server."""

    supported_protocols: list[str] = Field(default_factory=lambda: ["sftp"], init=False)
    access_config: Secret[SftpAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    username: str = Field(description="Username for sftp connection")
    host: Optional[str] = Field(default=None, description="Hostname for sftp connection")
    port: int = Field(default=22, description="Port for sftp connection")
    look_for_keys: bool = Field(
        default=False, description="Whether to search for private key files in ~/.ssh/"
    )
    allow_agent: bool = Field(default=False, description="Whether to connect to the SSH agent.")

    def get_access_config(self) -> dict[str, Any]:
        """Return the keyword arguments passed to the filesystem constructor."""
        access_config = {
            "username": self.username,
            "host": self.host,
            "port": self.port,
            "look_for_keys": self.look_for_keys,
            "allow_agent": self.allow_agent,
            "password": self.access_config.get_secret_value().password,
        }
        return access_config

    @contextmanager
    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def get_client(self, protocol: str) -> Generator["SFTPFileSystem", None, None]:
        """Yield a filesystem client for ``protocol`` and close its SSH client on exit.

        Args:
            protocol: Name used to look up the fsspec filesystem class.

        Yields:
            The filesystem instance built from ``get_access_config()``.
        """
        # The paramiko.SSHClient() client that's opened by the SFTPFileSystem
        # never gets closed so explicitly adding that as part of this context manager
        from fsspec import get_filesystem_class

        client: SFTPFileSystem = get_filesystem_class(protocol)(
            **self.get_access_config(),
        )
        try:
            yield client
        finally:
            # Runs even when the caller's ``with`` body raises, so the SSH
            # connection is not leaked on errors.
            client.client.close()
86
+
87
+
88
@dataclass
class SftpIndexer(FsspecIndexer):
    """Indexer for SFTP that takes host and port from the remote URL when present."""

    connection_config: SftpConnectionConfig
    index_config: SftpIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def __post_init__(self):
        """Overwrite the configured host/port with those found in ``remote_url``, if any."""
        url = urlparse(self.index_config.remote_url)
        config = self.connection_config
        config.host = url.hostname or config.host
        config.port = url.port or config.port

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield the parent's file records with identifiers rewritten as full sftp URLs."""
        for record in super().run(**kwargs):
            host = self.connection_config.host
            port = self.connection_config.port
            record.identifier = f"sftp://{host}:{port}/{record.identifier}"
            yield record

    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        """Build ``FileDataSourceMetadata`` from one listing record.

        Reads ``name`` (required) and, when present, ``time``, ``mtime`` and
        ``size`` from ``file_data``.
        """
        remote_name = file_data["name"]

        date_created = None
        if "time" in file_data:
            date_created = str(file_data.get("time").timestamp())

        date_modified = None
        if "mtime" in file_data:
            date_modified = str(file_data.get("mtime").timestamp())

        size_bytes = file_data.get("size")

        locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
        }
        return FileDataSourceMetadata(
            url=f"{self.index_config.protocol}://{remote_name}",
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            filesize_bytes=size_bytes,
            record_locator=locator,
        )
129
+
130
+
131
class SftpDownloaderConfig(FsspecDownloaderConfig):
    """Downloader settings for SFTP; requires the remote URL."""

    # Required: there is no default.
    remote_url: str = Field(
        description="Remote fsspec URL formatted as `protocol://dir/path`",
    )
133
+
134
+
135
@dataclass
class SftpDownloader(FsspecDownloader):
    """Downloader for SFTP that takes host and port from the download URL when present."""

    protocol: str = "sftp"
    connection_config: SftpConnectionConfig
    # This is a stdlib dataclass, so the default must be the plain value; a
    # pydantic ``Field(...)`` here would itself become the attribute value.
    connector_type: str = CONNECTOR_TYPE
    # NOTE(review): SftpDownloaderConfig declares a required ``remote_url``, so
    # this factory presumably fails if no config is passed in — confirm with callers.
    download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)

    def __post_init__(self):
        """Overwrite the configured host/port with those found in the download URL, if any."""
        parsed_url = urlparse(self.download_config.remote_url)
        self.connection_config.host = parsed_url.hostname or self.connection_config.host
        self.connection_config.port = parsed_url.port or self.connection_config.port
146
+
147
+
148
class SftpUploaderConfig(FsspecUploaderConfig):
    """Uploader settings for SFTP; declares nothing beyond the fsspec base config."""
150
+
151
+
152
@dataclass
class SftpUploader(FsspecUploader):
    """Uploader that writes through an ``SftpConnectionConfig``."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: SftpConnectionConfig
    # Defaults to None although the annotation is not Optional.
    upload_config: SftpUploaderConfig = field(
        default=None,
    )
157
+
158
+
159
# Registry entry wiring together the SFTP source-side classes.
sftp_source_entry = SourceRegistryEntry(
    connection_config=SftpConnectionConfig,
    indexer_config=SftpIndexerConfig,
    indexer=SftpIndexer,
    downloader_config=SftpDownloaderConfig,
    downloader=SftpDownloader,
)

# Registry entry wiring together the SFTP destination-side classes.
sftp_destination_entry = DestinationRegistryEntry(
    connection_config=SftpConnectionConfig,
    uploader_config=SftpUploaderConfig,
    uploader=SftpUploader,
)
@@ -0,0 +1,17 @@
1
+ import json
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+ from typing import Callable
5
+
6
+
7
def json_serial(obj):
    """Convert a value the ``json`` module cannot encode into a JSON-friendly one.

    Args:
        obj: The value to convert.

    Returns:
        The POSIX-style string for a ``Path``, or the ISO-8601 string for a
        ``datetime``.

    Raises:
        TypeError: If ``obj`` is neither a ``Path`` nor a ``datetime``.
    """
    serializers = (
        (Path, lambda value: value.as_posix()),
        (datetime, lambda value: value.isoformat()),
    )
    for supported_type, serialize in serializers:
        if isinstance(obj, supported_type):
            return serialize(obj)
    raise TypeError("Type %s not serializable" % type(obj))
13
+
14
+
15
def sterilize_dict(data: dict, default: Callable = json_serial) -> dict:
    """Round-trip ``data`` through JSON so only JSON-native values remain.

    Args:
        data: The dictionary to normalize.
        default: Fallback passed to ``json.dumps`` for values it cannot encode.

    Returns:
        The result of decoding the JSON text produced from ``data``.
    """
    return json.loads(json.dumps(data, default=default))
@@ -0,0 +1,268 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any, Generator, Optional
7
+ from urllib.parse import urlparse
8
+
9
+ from pydantic import Field, Secret, model_validator
10
+
11
+ from unstructured_ingest.error import SourceConnectionError
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.v2.interfaces import (
14
+ AccessConfig,
15
+ ConnectionConfig,
16
+ Downloader,
17
+ DownloaderConfig,
18
+ DownloadResponse,
19
+ FileData,
20
+ FileDataSourceMetadata,
21
+ Indexer,
22
+ IndexerConfig,
23
+ SourceIdentifiers,
24
+ )
25
+ from unstructured_ingest.v2.logger import logger
26
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
27
+
28
+ CONNECTOR_TYPE = "gitlab"
29
+ if TYPE_CHECKING:
30
+ from gitlab import Gitlab
31
+ from gitlab.v4.objects.projects import Project
32
+
33
+
34
class GitLabAccessConfig(AccessConfig):
    """Secret part of the GitLab configuration."""

    # Optional; defaults to None.
    access_token: Optional[str] = Field(
        description="Optional personal access token for authenticating with the GitLab API.",
        default=None,
    )
39
+
40
+
41
class GitLabConnectionConfig(ConnectionConfig):
    """Connection settings for a GitLab project, derived from its URL."""

    access_config: Secret[GitLabAccessConfig] = Field(
        default_factory=GitLabAccessConfig,
        validate_default=True,
        description="Secret configuration for accessing the GitLab API by authentication token.",
    )
    url: str = Field(description="The full URL to the GitLab project or repository.")
    base_url: str = Field(
        default="https://gitlab.com",
        description="The base URL for the GitLab instance (default is GitLab's public domain).",
    )
    # Filled in by ``set_repo_path`` after validation.
    repo_path: str = Field(
        default=None,
        init=False,
        repr=False,
        description="The normalized path extracted from the repository URL.",
    )

    @model_validator(mode="after")
    def set_repo_path(self):
        """Derive ``base_url`` and ``repo_path`` from ``url``.

        When ``url`` has both a scheme and a network location, ``base_url`` is
        replaced by ``scheme://netloc``. ``repo_path`` is always set to the URL
        path with leading slashes removed.

        Returns:
            The validated model instance.
        """
        parts = urlparse(self.url)
        has_origin = parts.scheme and parts.netloc
        if has_origin:
            self.base_url = f"{parts.scheme}://{parts.netloc}"
        self.repo_path = parts.path.lstrip("/")
        return self

    @SourceConnectionError.wrap
    @requires_dependencies(["gitlab"], extras="gitlab")
    @contextmanager
    def get_client(self) -> Generator["Gitlab", None, None]:
        """Yield a ``Gitlab`` client for ``base_url`` using the configured token."""
        from gitlab import Gitlab

        logger.info(f"Connection to GitLab: {self.base_url!r}")
        token = self.access_config.get_secret_value().access_token
        with Gitlab(self.base_url, private_token=token) as gitlab_client:
            yield gitlab_client

    @contextmanager
    def get_project(self) -> Generator["Project", None, None]:
        """Yield the project identified by ``repo_path``.

        Yields:
            The object returned by ``client.projects.get(repo_path)``.
        """
        with self.get_client() as gitlab_client:
            logger.info(f"Accessing Project: '{self.repo_path}'")
            found = gitlab_client.projects.get(self.repo_path)

            logger.info(f"Successfully accessed project '{self.repo_path}'")
            yield found
113
+
114
+
115
class GitLabIndexerConfig(IndexerConfig):
    """Indexer settings selecting which part of the repository is listed."""

    # Defaults to "/".
    path: Path = Field(
        description=("Path to the location in the repository that will be processed."),
        default="/",
    )
    # Defaults to True.
    recursive: bool = Field(
        description=(
            "Flag to control recursive operations when indexing. "
            "If True, the indexer will traverse directories recursively."
        ),
        default=True,
    )
    # When None, the indexer falls back to the project's default branch.
    git_branch: Optional[str] = Field(
        description="The name of the branch to interact with.",
        default=None,
    )
130
+
131
+
132
@dataclass
class GitLabIndexer(Indexer):
    """Indexer that walks a GitLab repository tree and yields one record per blob."""

    connection_config: GitLabConnectionConfig
    index_config: GitLabIndexerConfig

    def precheck(self) -> None:
        """Validate the connection by authenticating or fetching the project.

        With an access token configured, ``client.auth()`` is called; without
        one, the project at ``repo_path`` is fetched instead.

        Raises:
            SourceConnectionError: If either call fails; the original exception
                is attached as the cause.
        """
        try:
            with self.connection_config.get_client() as client:
                if self.connection_config.access_config.get_secret_value().access_token is not None:
                    client.auth()
                else:
                    client.projects.get(self.connection_config.repo_path)

        except Exception as e:
            logger.error(f"Failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"Failed to validate connection: {e}") from e

    def _relative_path(self, file_path: str) -> str:
        """Return ``file_path`` relative to the configured index path.

        Leading slashes on the configured path are ignored, so a root of "/"
        behaves like the repository root. When ``file_path`` does not lie under
        the configured path, it is returned unchanged rather than raising.
        """
        base = Path(str(self.index_config.path).lstrip("/"))
        try:
            return str(Path(file_path).relative_to(base))
        except ValueError:
            return file_path

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Iterate over the repository tree and yield a ``FileData`` per blob.

        The tree is fetched for ``git_branch``, or for the project's default
        branch when none is configured. Entries whose type is not ``blob`` are
        skipped.

        Args:
            **kwargs (Any): Additional keyword arguments; not used here.

        Yields:
            FileData: One record per blob, carrying its path, a record locator
                (``file_path`` and ``ref``) and its mode as permissions data.
        """
        with self.connection_config.get_project() as project:
            ref = self.index_config.git_branch or project.default_branch

            files = project.repository_tree(
                path=str(self.index_config.path),
                ref=ref,
                recursive=self.index_config.recursive,
                iterator=True,
                all=True,
            )

            for file in files:
                if file["type"] != "blob":
                    continue

                record_locator = {
                    "file_path": file["path"],
                    "ref": ref,
                }

                yield FileData(
                    identifier=file["id"],
                    connector_type=CONNECTOR_TYPE,
                    source_identifiers=SourceIdentifiers(
                        fullpath=file["path"],
                        filename=Path(file["path"]).name,
                        rel_path=self._relative_path(file["path"]),
                    ),
                    metadata=FileDataSourceMetadata(
                        url=file["id"],
                        record_locator=record_locator,
                        permissions_data=[{"mode": file["mode"]}],
                    ),
                    additional_metadata={},
                )
207
+
208
+
209
class GitLabDownloaderConfig(DownloaderConfig):
    """Downloader settings for GitLab; declares nothing beyond the base config."""
211
+
212
+
213
@dataclass
class GitLabDownloader(Downloader):
    """Downloader that fetches a single repository file to local disk."""

    connection_config: GitLabConnectionConfig
    download_config: GitLabDownloaderConfig

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the file described by ``file_data``.

        Args:
            file_data (FileData): Metadata about the file to be downloaded.
            **kwargs (Any): Additional arguments; not used here.

        Returns:
            DownloadResponse: The response generated for the downloaded file.

        Raises:
            ValueError: If no download path can be generated, or if the record
                locator is missing or incomplete.
        """
        download_path = self.get_download_path(file_data=file_data)
        if download_path is None:
            logger.error(
                "Generated download path is None, source_identifiers might be missing "
                "from FileData."
            )
            raise ValueError("Generated invalid download path.")

        self._download_file(file_data, download_path)
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def _download_file(self, file_data: FileData, download_path: Path) -> None:
        """Fetch the file named by the record locator and write it to ``download_path``.

        Raises:
            ValueError: If the record locator is None or lacks ``ref`` or
                ``file_path``.
        """
        # NOTE: Indexer should supply the record locator in metadata
        if (
            file_data.metadata.record_locator is None
            or "ref" not in file_data.metadata.record_locator
            or "file_path" not in file_data.metadata.record_locator
        ):
            logger.error(
                f"Invalid record locator in metadata: {file_data.metadata.record_locator}. "
                "Keys 'ref' and 'file_path' must be present."
            )
            raise ValueError("Invalid record locator.")

        ref = file_data.metadata.record_locator["ref"]
        path = file_data.metadata.record_locator["file_path"]
        # Create the destination directory before writing.
        download_path.parent.mkdir(exist_ok=True, parents=True)

        with self.connection_config.get_project() as project:
            project_file = project.files.get(file_path=path, ref=ref)
            with open(download_path, "wb") as file:
                file.write(project_file.decode())
260
+
261
+
262
# Registry entry wiring together the GitLab source-side classes.
gitlab_source_entry = SourceRegistryEntry(
    indexer=GitLabIndexer,
    indexer_config=GitLabIndexerConfig,
    downloader=GitLabDownloader,
    downloader_config=GitLabDownloaderConfig,
    connection_config=GitLabConnectionConfig,
)