unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,89 @@
1
+ import os
2
+ from abc import ABC, abstractmethod
3
+ from pathlib import Path
4
+ from typing import Any, Optional, TypedDict, TypeVar, Union
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+ from unstructured_ingest.v2.interfaces.connector import BaseConnector
9
+ from unstructured_ingest.v2.interfaces.file_data import FileData
10
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
11
+
12
+
13
# Base configuration model for downloaders; connector-specific configs are
# bound to it through the DownloaderConfigT type variable.
class DownloaderConfig(BaseModel):
    # Optional target directory. When left as None, Downloader.download_dir
    # fills in a default location under the user's home cache directory.
    download_dir: Optional[Path] = Field(
        default=None,
        # The two literals are concatenated implicitly, so the first one must
        # end with a space to keep the rendered help text readable.
        description="Where files are downloaded to, defaults to a location at "
        "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
    )
19
+
20
+
21
+ DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig)
22
+
23
+
24
class DownloadResponse(TypedDict):
    """Result of downloading one file: its metadata record and local path."""

    # Metadata record describing the downloaded file.
    file_data: FileData
    # Location on disk the content was written to.
    path: Path
27
+
28
+
29
+ download_responses = Union[list[DownloadResponse], DownloadResponse]
30
+
31
+
32
class Downloader(BaseProcess, BaseConnector, ABC):
    """Abstract pipeline step that downloads the content described by a FileData.

    Subclasses implement ``run``; the helpers here compute local paths and
    build the ``DownloadResponse`` returned to the pipeline.
    """

    connector_type: str
    download_config: DownloaderConfigT

    def get_download_path(self, file_data: FileData) -> Optional[Path]:
        """Return the local path for ``file_data`` under ``download_dir``.

        Returns ``None`` when the file data carries no source identifiers or
        no usable relative path.
        """
        if not file_data.source_identifiers:
            return None
        rel_path = file_data.source_identifiers.relative_path
        if not rel_path:
            return None
        # Strip *every* leading slash: joining an absolute path onto
        # download_dir would discard download_dir entirely, and removing only
        # the first slash leaves inputs such as "//a/b" absolute.
        rel_path = rel_path.lstrip("/")
        return self.download_dir / Path(rel_path)

    @staticmethod
    def is_float(value: str):
        """Return True when ``value`` can be converted with ``float()``."""
        try:
            float(value)
            return True
        except (TypeError, ValueError):
            # TypeError covers non-string, non-numeric inputs (e.g. None).
            return False

    def generate_download_response(
        self, file_data: FileData, download_path: Path
    ) -> DownloadResponse:
        """Record the download location on ``file_data`` and wrap it in a response.

        When both the created and modified dates in the metadata are numeric
        strings, they are applied to the downloaded file via ``os.utime`` as
        (access time, modification time) respectively.
        """
        if (
            file_data.metadata.date_modified
            and self.is_float(file_data.metadata.date_modified)
            and file_data.metadata.date_created
            and self.is_float(file_data.metadata.date_created)
        ):
            date_modified = float(file_data.metadata.date_modified)
            date_created = float(file_data.metadata.date_created)
            os.utime(download_path, times=(date_created, date_modified))
        file_data.local_download_path = str(download_path.resolve())
        return DownloadResponse(file_data=file_data, path=download_path)

    @property
    def download_dir(self) -> Path:
        """Directory downloads are written to.

        If the config does not set one, a default under
        ``~/.cache/unstructured/ingest/download/<connector_type>`` is computed
        once and stored back on the config.
        """
        if self.download_config.download_dir is None:
            self.download_config.download_dir = (
                Path.home()
                / ".cache"
                / "unstructured"
                / "ingest"
                / "download"
                / self.connector_type
            ).resolve()
        return self.download_config.download_dir

    def is_async(self) -> bool:
        """Downloaders report themselves as async-capable by default."""
        return True

    @abstractmethod
    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download the content for ``file_data``; implemented by subclasses."""
        pass

    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Async entry point; by default it delegates to the synchronous ``run``."""
        return self.run(file_data=file_data, **kwargs)
@@ -0,0 +1,116 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any, Optional
4
+ from uuid import NAMESPACE_DNS, uuid5
5
+
6
+ from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
7
+
8
+ from unstructured_ingest.v2.logger import logger
9
+
10
+
11
# Names that locate a file at its source: the bare filename, the full path and
# an optional path relative to some root.
class SourceIdentifiers(BaseModel):
    filename: str
    fullpath: str
    rel_path: Optional[str] = None

    @property
    def filename_stem(self) -> str:
        # Filename without its final suffix.
        name_as_path = Path(self.filename)
        return name_as_path.stem

    @property
    def relative_path(self) -> str:
        # Prefer the explicit relative path; fall back to the full path when
        # it is unset or empty.
        if self.rel_path:
            return self.rel_path
        return self.fullpath
23
+
24
+
25
# Optional metadata captured about a file at its source. Every field defaults
# to None, so an empty instance is valid.
class FileDataSourceMetadata(BaseModel):
    url: Optional[str] = None
    version: Optional[str] = None
    record_locator: Optional[dict[str, Any]] = None
    # Dates are stored as strings.
    date_created: Optional[str] = None
    date_modified: Optional[str] = None
    date_processed: Optional[str] = None
    permissions_data: Optional[list[dict[str, Any]]] = None
    filesize_bytes: Optional[int] = None
34
+
35
+
36
# Record describing a single file flowing through the pipeline, with helpers
# to round-trip it through a JSON file on disk.
class FileData(BaseModel):
    identifier: str
    connector_type: str
    source_identifiers: SourceIdentifiers
    metadata: FileDataSourceMetadata = Field(default_factory=FileDataSourceMetadata)
    additional_metadata: dict[str, Any] = Field(default_factory=dict)
    reprocess: bool = False
    local_download_path: Optional[str] = None
    display_name: Optional[str] = None

    @classmethod
    def from_file(cls, path: str) -> "FileData":
        # Load and validate an instance from a JSON file; a path that does not
        # point at an existing regular file is rejected with ValueError.
        resolved = Path(path).resolve()
        if not (resolved.exists() and resolved.is_file()):
            raise ValueError(f"file path not valid: {resolved}")
        with open(str(resolved.resolve()), "rb") as f:
            raw = json.load(f)
        return cls.model_validate(raw)

    @classmethod
    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
        # Re-validate an existing instance's dumped data as this class; extra
        # keyword arguments are forwarded to model_validate.
        return cls.model_validate(file_data.model_dump(), **kwargs)

    def to_file(self, path: str) -> None:
        # Serialize this instance as indented JSON, creating parent
        # directories as needed.
        target = Path(path).resolve()
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(str(target.resolve()), "w") as f:
            json.dump(self.model_dump(), f, indent=2)
66
+
67
+
68
# One entry of a batch: an identifier plus an optional version string.
class BatchItem(BaseModel):
    identifier: str
    version: Optional[str] = None
71
+
72
+
73
# FileData variant representing a batch of items. Its identifier is derived
# from the batch contents when the caller does not supply one.
class BatchFileData(FileData):
    identifier: str = Field(init=False)
    batch_items: list[BatchItem]
    source_identifiers: Optional[SourceIdentifiers] = None

    @field_validator("batch_items")
    @classmethod
    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
        # Reject empty batches and duplicate identifiers, then return the
        # items sorted by identifier so the stored order is deterministic.
        if not v:
            raise ValueError("batch items cannot be empty")
        all_identifiers = [item.identifier for item in v]
        if len(all_identifiers) != len(set(all_identifiers)):
            raise ValueError(f"duplicate identifiers: {all_identifiers}")
        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
        return sorted_batch_items

    @model_validator(mode="before")
    @classmethod
    def populate_identifier(cls, data: Any) -> Any:
        # Runs before field validation, so batch_items may still hold raw
        # dicts (e.g. data loaded from JSON) rather than BatchItem instances,
        # or may be missing or malformed. In those malformed cases the data is
        # returned untouched so that regular field validation reports a
        # ValidationError; a KeyError/AttributeError raised here would not be
        # converted into one and would escape callers that catch
        # ValidationError to fall back to plain FileData.
        if not isinstance(data, dict) or "identifier" in data:
            return data
        batch_items = data.get("batch_items")
        if not isinstance(batch_items, (list, tuple)) or not batch_items:
            return data

        versions_by_identifier = {}
        for item in batch_items:
            if isinstance(item, dict):
                item_identifier = item.get("identifier")
                item_version = item.get("version")
            else:
                item_identifier = getattr(item, "identifier", None)
                item_version = getattr(item, "version", None)
            if not isinstance(item_identifier, str):
                return data
            versions_by_identifier[item_identifier] = item_version

        try:
            identifier_data = json.dumps(versions_by_identifier, sort_keys=True)
        except (TypeError, ValueError):
            # A non-serializable version value; leave it to field validation.
            return data
        # Return a shallow copy instead of mutating the caller's dict.
        return {**data, "identifier": str(uuid5(NAMESPACE_DNS, str(identifier_data)))}
99
+
100
+
101
def file_data_from_file(path: str) -> FileData:
    """Load file data from ``path``, trying the batch variant first.

    If the content does not validate as ``BatchFileData``, a debug message is
    logged and the file is loaded as plain ``FileData`` instead.
    """
    try:
        batch_file_data = BatchFileData.from_file(path=path)
    except ValidationError:
        logger.debug(f"{path} not valid for batch file data")
    else:
        return batch_file_data

    return FileData.from_file(path=path)
108
+
109
+
110
def file_data_from_dict(data: dict) -> FileData:
    """Validate ``data`` as file data, trying the batch variant first.

    If the dict does not validate as ``BatchFileData``, a debug message is
    logged and it is validated as plain ``FileData`` instead.
    """
    try:
        batch_file_data = BatchFileData.model_validate(data)
    except ValidationError:
        logger.debug(f"{data} not valid for batch file data")
    else:
        return batch_file_data

    return FileData.model_validate(data)
@@ -0,0 +1,30 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from unstructured_ingest.v2.interfaces.connector import BaseConnector
7
+ from unstructured_ingest.v2.interfaces.file_data import FileData
8
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
9
+
10
+
11
# Base configuration model for indexers; it declares no fields of its own and
# serves as the bound of the IndexerConfigT type variable.
class IndexerConfig(BaseModel):
    pass
13
+
14
+
15
+ IndexerConfigT = TypeVar("IndexerConfigT", bound=IndexerConfig)
16
+
17
+
18
class Indexer(BaseProcess, BaseConnector, ABC):
    """Abstract pipeline step that yields ``FileData`` records for a source.

    Subclasses implement the synchronous generator ``run``; the async variant
    is not implemented at this level.
    """

    connector_type: str
    index_config: Optional[IndexerConfigT] = None

    def is_async(self) -> bool:
        """Indexers are synchronous unless a subclass says otherwise."""
        return False

    @abstractmethod
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per discovered item; implemented by subclasses."""
        ...

    async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
        """Async indexing hook; raises ``NotImplementedError`` by default."""
        raise NotImplementedError()
@@ -0,0 +1,19 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ from typing import Any
4
+
5
+
6
@dataclass
class BaseProcess(ABC):
    """Common interface for pipeline steps.

    A step must implement ``run``. ``run_async`` falls back to calling ``run``
    directly, ``is_async`` reports ``False`` and ``precheck`` is a no-op unless
    overridden.
    """

    def is_async(self) -> bool:
        """Report whether this step should be driven through ``run_async``."""
        return False

    def precheck(self) -> None:
        """Hook for validation before running; does nothing by default."""
        return None

    @abstractmethod
    def run(self, **kwargs: Any) -> Any:
        """Execute the step; implemented by subclasses."""
        ...

    async def run_async(self, **kwargs: Any) -> Any:
        """Async entry point that forwards its keyword arguments to ``run``."""
        outcome = self.run(**kwargs)
        return outcome
@@ -0,0 +1,88 @@
1
+ import os
2
+ from asyncio import Semaphore
3
+ from pathlib import Path
4
+ from typing import Any, Optional
5
+
6
+ from pydantic import BaseModel, ConfigDict, Field
7
+
8
+ DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())
9
+
10
+
11
# Pipeline-wide processing options (parallelism, caching, cleanup, tracing)
# plus a small amount of runtime state shared across steps.
class ProcessorConfig(BaseModel):
    # asyncio.Semaphore is not a pydantic-native type.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    reprocess: bool = Field(
        default=False,
        description="Reprocess a downloaded file even if the relevant structured "
        "output .json file in output directory already exists.",
    )
    verbose: bool = Field(default=False)
    tqdm: bool = Field(default=False, description="Display tqdm progress bar")
    work_dir: str = Field(
        default_factory=lambda: DEFAULT_WORK_DIR,
        description="Where to place working files when processing each step",
    )
    num_processes: int = Field(
        default=2, description="Number of parallel processes with which to process docs"
    )
    max_connections: Optional[int] = Field(
        default=None, description="Limit of concurrent connections"
    )
    raise_on_error: bool = Field(
        default=False,
        description="If set, will raise error if any doc in the pipeline fail. "
        "Otherwise will log error and continue with other docs",
    )
    # Defaults to the INGEST_DISABLE_PARALLELISM environment variable, read at
    # instantiation time ("true", case-insensitive, enables it).
    disable_parallelism: bool = Field(
        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true",
    )
    preserve_downloads: bool = Field(
        default=False, description="Don't delete downloaded files after process completes"
    )
    download_only: bool = Field(
        default=False, description="skip the rest of the process after files are downloaded"
    )
    re_download: bool = Field(
        default=False,
        description="If set, will re-download downloaded files "
        "regardless of if they already exist locally",
    )
    uncompress: bool = Field(
        default=False,
        description="Uncompress any archived files. Currently supporting "
        "zip and tar files based on file extension.",
    )
    iter_delete: bool = Field(
        default=False,
        description="If limited on memory, this can be enabled to delete "
        "cached content as it's used and no longer needed in the pipeline.",
    )
    delete_cache: bool = Field(
        default=False,
        description="If set, will delete the cache work directory when process finishes",
    )

    # OTEL support
    otel_endpoint: Optional[str] = Field(
        default=None, description="OTEL endpoint to publish trace data to"
    )

    # Used to keep track of state in pipeline
    status: dict = Field(default_factory=dict)
    # Excluded from serialization; created in model_post_init when a
    # connection limit is configured.
    semaphore: Optional[Semaphore] = Field(init=False, default=None, exclude=True)

    def model_post_init(self, __context: Any) -> None:
        # Build the semaphore only when a connection limit was given.
        if self.max_connections is not None:
            self.semaphore = Semaphore(self.max_connections)

    @property
    def mp_supported(self) -> bool:
        # Multiprocessing needs parallelism enabled and more than one process.
        return not self.disable_parallelism and self.num_processes > 1

    @property
    def async_supported(self) -> bool:
        # Async is off when parallelism is disabled or when the connection
        # limit allows at most one connection; otherwise it is on.
        if self.disable_parallelism:
            return False
        if self.max_connections is not None and isinstance(self.max_connections, int):
            return self.max_connections > 1
        return True
@@ -0,0 +1,102 @@
1
+ import json
2
+ from abc import ABC
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Any, TypeVar
6
+
7
+ import ndjson
8
+ from pydantic import BaseModel
9
+
10
+ from unstructured_ingest.v2.interfaces.file_data import FileData
11
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
12
+
13
+
14
# Base configuration model for upload stagers; it declares no fields of its
# own and serves as the bound of the UploadStagerConfigT type variable.
class UploadStagerConfig(BaseModel):
    pass
16
+
17
+
18
+ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
19
+
20
+
21
@dataclass
class UploadStager(BaseProcess, ABC):
    """Pipeline step that rewrites element files before they are uploaded.

    Each element dict is passed through ``conform_dict`` (identity by default)
    and the result is written to a new file in the output directory. Both
    ``.json`` and ``.ndjson`` inputs are supported.
    """

    upload_stager_config: UploadStagerConfigT

    def write_output(self, output_path: Path, data: list[dict]) -> None:
        """Write ``data`` to ``output_path`` as JSON or NDJSON based on its suffix.

        Raises ``ValueError`` for any other suffix.
        """
        suffix = output_path.suffix
        if suffix not in (".json", ".ndjson"):
            raise ValueError(f"Unsupported output format: {output_path}")
        with output_path.open("w") as f:
            if suffix == ".json":
                json.dump(data, f, indent=2)
            else:
                ndjson.dump(data, f)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Transform a single element; the base implementation returns it as is."""
        return element_dict

    def get_output_path(self, output_filename: str, output_dir: Path) -> Path:
        """Return ``output_dir / <final component of output_filename>``.

        The parent directory of the result is created if it does not exist.
        """
        requested = Path(output_filename)
        # stem + suffix keeps only the last path component of the request.
        flattened_name = f"{requested.stem}{requested.suffix}"
        destination = Path(output_dir) / Path(f"{flattened_name}")
        destination.parent.mkdir(parents=True, exist_ok=True)
        return destination

    def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """Conform an NDJSON file row by row, flushing after each written row."""
        with input_file.open() as source:
            rows = ndjson.reader(source)
            with output_file.open("w") as sink:
                row_writer = ndjson.writer(sink)
                for row in rows:
                    row_writer.writerow(
                        row=self.conform_dict(element_dict=row, file_data=file_data)
                    )
                    row_writer.f.flush()

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """Conform a JSON file by loading all elements, then writing them back out."""
        with input_file.open() as source:
            loaded_elements = json.load(source)

        conformed = []
        for loaded in loaded_elements:
            conformed.append(self.conform_dict(element_dict=loaded, file_data=file_data))

        with open(output_file, "w") as sink:
            json.dump(conformed, sink, indent=2)

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Stage ``elements_filepath`` into ``output_dir`` and return the new path.

        Raises ``ValueError`` when the input suffix is neither ``.ndjson`` nor
        ``.json``.
        """
        # The output path (and its parent directory) is prepared before the
        # input suffix is inspected.
        output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
        handlers = {".ndjson": self.stream_update, ".json": self.process_whole}
        handler = handlers.get(elements_filepath.suffix)
        if handler is None:
            raise ValueError(f"Unsupported file extension: {elements_filepath}")
        handler(input_file=elements_filepath, output_file=output_file, file_data=file_data)
        return output_file

    async def run_async(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Async entry point; delegates to the synchronous ``run``."""
        staged_path = self.run(
            elements_filepath=elements_filepath,
            output_dir=output_dir,
            output_filename=output_filename,
            file_data=file_data,
            **kwargs,
        )
        return staged_path
@@ -0,0 +1,53 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Any, TypeVar
5
+
6
+ from pydantic import BaseModel
7
+
8
+ from unstructured_ingest.utils.data_prep import get_data
9
+ from unstructured_ingest.v2.interfaces.connector import BaseConnector
10
+ from unstructured_ingest.v2.interfaces.file_data import FileData
11
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
12
+
13
+
14
# Base configuration model for uploaders; it declares no fields of its own and
# serves as the bound of the UploaderConfigT type variable.
class UploaderConfig(BaseModel):
    pass
16
+
17
+
18
+ UploaderConfigT = TypeVar("UploaderConfigT", bound=UploaderConfig)
19
+
20
+
21
@dataclass
class UploadContent:
    """Pairs a local file path with the file data record it belongs to."""

    # Path of the file to upload.
    path: Path
    # Metadata record associated with that file.
    file_data: FileData
25
+
26
+
27
@dataclass
class Uploader(BaseProcess, BaseConnector, ABC):
    """Pipeline step that sends staged content to a destination.

    ``run`` / ``run_async`` load the content from a path with ``get_data`` and
    hand it to ``run_data`` / ``run_data_async``. ``run_data`` and
    ``run_batch`` raise ``NotImplementedError`` until a subclass provides them.
    """

    upload_config: UploaderConfigT
    connector_type: str

    def is_async(self) -> bool:
        """Uploaders are synchronous unless a subclass says otherwise."""
        return False

    def is_batch(self) -> bool:
        """Uploaders handle one file at a time unless a subclass says otherwise."""
        return False

    def run_batch(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Upload several files at once; not implemented at this level."""
        raise NotImplementedError()

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the content at ``path`` and pass it to ``run_data``."""
        loaded = get_data(path=path)
        self.run_data(data=loaded, file_data=file_data, **kwargs)

    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the content at ``path`` and await ``run_data_async`` with it."""
        loaded = get_data(path=path)
        await self.run_data_async(data=loaded, file_data=file_data, **kwargs)

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload already-loaded content; not implemented at this level."""
        raise NotImplementedError()

    async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Async variant of ``run_data``; delegates to it by default."""
        outcome = self.run_data(data=data, file_data=file_data, **kwargs)
        return outcome
@@ -0,0 +1,126 @@
1
+ import ast
2
+ import json
3
+ import os
4
+ from logging import Formatter, Logger, StreamHandler, getLevelName, getLogger
5
+ from typing import Any, Callable
6
+
7
+ log_level = os.getenv("INGEST_LOG_LEVEL", "INFO")
8
+ LOGGER_NAME = "unstructured_ingest.v2"
9
+
10
+
11
def default_is_data_sensitive(k: str, v: Any) -> bool:
    """Default predicate deciding whether the value stored under ``k`` is sensitive.

    A key listed in ``sensitive_fields`` is always sensitive. Any other key is
    sensitive when its value is truthy and the lower-cased key contains one of
    the trigger substrings.
    """
    sensitive_fields = [
        "account_name",
        "client_id",
    ]
    sensitive_triggers = ["key", "cred", "token", "password", "oauth", "secret"]
    return (
        v
        and any([s in k.lower() for s in sensitive_triggers])  # noqa: C419
        or k.lower() in sensitive_fields
    )


def hide_sensitive_fields(
    data: dict, is_sensitive_fn: Callable[[str, Any], bool] = default_is_data_sensitive
) -> dict:
    """
    Will recursively look through every k, v pair in this dict and any nested ones and run
    is_sensitive_fn to dynamically redact the value of the k, v pair. Will also check if
    any string value can be parsed as valid json and process that dict as well and replace
    the original string with the json.dumps() version of the redacted dict.

    The input dict is not modified; a shallow copy with redacted values is returned.
    """
    new_data = data.copy()
    for k, v in data.items():
        if is_sensitive_fn(k, v):
            # A sensitive value is replaced wholesale and never inspected
            # further; otherwise the nested/JSON handling below would write
            # (part of) the secret back over the mask.
            new_data[k] = "*******"
        elif isinstance(v, dict):
            # Pass the predicate along so custom rules apply at every depth.
            new_data[k] = hide_sensitive_fields(v, is_sensitive_fn)
        elif isinstance(v, str):
            # Need to take into account strings generated via json.dumps()
            try:
                json_data = json.loads(v)
            except json.JSONDecodeError:
                continue
            if isinstance(json_data, dict):
                updated_data = hide_sensitive_fields(json_data, is_sensitive_fn)
                new_data[k] = json.dumps(updated_data)

    return new_data
50
+
51
+
52
def _parse_dict_literal(text: str):
    """Return ``text`` parsed as a JSON-compatible dict, or None if it is not one.

    JSON is tried first; a Python literal (e.g. a printed dict using single
    quotes) is accepted as a fallback and normalised through a JSON round trip.
    """
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    if not isinstance(parsed, dict):
        return None
    try:
        return json.loads(json.dumps(parsed))
    except (TypeError, ValueError):
        # Contains keys/values that JSON cannot represent.
        return None


def redact_jsons(s: str) -> str:
    """
    Takes in a generic string and pulls out all valid json content. Leverages
    hide_sensitive_fields() to redact any sensitive information and replaces the
    original json with the new redacted format. There can be any number of valid
    jsons in a generic string and this will work.

    Brace-delimited text that is not a dict (e.g. '{file}' or '{1, 2}') is left
    untouched, and a '{' without a matching '}' is skipped rather than raising,
    so scanning continues with whatever follows it.
    """
    if "{" not in s:
        return s

    # Collect every top-level balanced "{...}" span.
    candidates = []
    length = len(s)
    i = 0
    while i < length:
        if s[i] != "{":
            i += 1
            continue
        depth = 0
        end = None
        for pos in range(i, length):
            char = s[pos]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    end = pos
                    break
        if end is None:
            # Unbalanced opening brace: ignore it and keep scanning so that
            # well-formed content after it is still processed.
            i += 1
            continue
        candidates.append(s[i : end + 1])
        i = end + 1

    for candidate in candidates:
        parsed = _parse_dict_literal(candidate)
        if parsed is None:
            continue
        hidden = json.dumps(hide_sensitive_fields(parsed))
        s = s.replace(candidate, hidden)
    return s
92
+
93
+
94
class SensitiveFormatter(Formatter):
    """Formatter that runs every rendered log line through ``redact_jsons``."""

    def format(self, record):
        """Return the formatted record with embedded JSON redacted.

        If redaction raises, the formatted text is returned prefixed with
        "Failed to redact: ".
        """
        rendered = super().format(record=record)
        try:
            redacted = redact_jsons(rendered)
        except Exception:
            return f"Failed to redact: {rendered}"
        return redacted
101
+
102
+
103
def remove_root_handlers(logger: Logger) -> None:
    """Detach every handler currently attached to the root logger.

    Args:
        logger: Any logger; only its ``root`` attribute is used.
    """
    # NOTE(robinson): in some environments such as Google Colab, there is a root handler
    # that does not mask secrets, meaning sensitive info such as api keys appear in logs.
    # Removing these when they exist prevents this behavior
    #
    # Iterate over a snapshot: removeHandler() mutates root.handlers, and
    # removing while iterating the live list skips every other handler.
    for handler in list(logger.root.handlers):
        logger.root.removeHandler(handler)
110
+
111
+
112
def make_default_logger(level: int) -> Logger:
    """Return a custom logger.

    The logger named by LOGGER_NAME gets a stream handler with a
    SensitiveFormatter (added only if no handler with the same name is already
    attached), its level is set to ``level``, and handlers on the root logger
    are removed.
    """
    ingest_logger = getLogger(LOGGER_NAME)

    stream_handler = StreamHandler()
    stream_handler.name = "ingest_log_handler"
    stream_handler.setFormatter(
        SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    )

    # Avoid stacking duplicate handlers when this is called more than once.
    attached_names = [existing.name for existing in ingest_logger.handlers]
    if stream_handler.name not in attached_names:
        ingest_logger.addHandler(stream_handler)

    ingest_logger.setLevel(level)
    remove_root_handlers(ingest_logger)
    return ingest_logger
124
+
125
+
126
+ logger = make_default_logger(level=getLevelName(log_level.upper()))
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env python3
2
+ from unstructured_ingest.v2.cli.cli import get_cmd
3
+
4
+
5
def main():
    """Build the ingest CLI command via ``get_cmd`` and invoke it."""
    get_cmd()()
8
+
9
+
10
+ if __name__ == "__main__":
11
+ main()