unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,75 @@
1
+ import os
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ from test.integration.connectors.utils.validation.utils import ValidationConfig
6
+ from unstructured_ingest.utils.data_prep import get_data
7
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers, UploadStager
8
+
9
+
10
class StagerValidationConfigs(ValidationConfig):
    """Validation settings used by the upload-stager checks in this module."""

    # Number of records the staged file is expected to contain
    expected_count: int

    def stager_output_dir(self) -> Path:
        """Return ``<test_output_dir>/stager``, creating the directory if needed."""
        output_dir = self.test_output_dir() / "stager"
        output_dir.mkdir(parents=True, exist_ok=True)
        return output_dir

    def stager_output_path(self, input_path: Path) -> Path:
        """Return the path inside the stager output dir that shares ``input_path``'s name."""
        return self.stager_output_dir().joinpath(input_path.name)
20
+
21
+
22
def run_all_stager_validations(
    configs: StagerValidationConfigs, input_file: Path, staged_filepath: Path
):
    """Validate a staged file against its input file and the saved fixture.

    Raises:
        AssertionError: if the file suffixes differ, the staged record count
            is not ``configs.expected_count``, the fixture file is missing,
            or ``configs.detect_diff`` reports a difference.
    """
    # The staged file must keep the input file's extension
    assert input_file.suffix == staged_filepath.suffix

    # The staged file must contain the expected number of records
    staged_records = get_data(path=staged_filepath)
    assert len(staged_records) == configs.expected_count

    # The staged file must match the stored fixture
    fixture_path = configs.stager_output_path(input_path=input_file)
    assert fixture_path.exists(), f"{fixture_path} does not exist"
    assert fixture_path.is_file(), f"{fixture_path} is not a file"
    has_diff = configs.detect_diff(
        expected_filepath=fixture_path, current_filepath=staged_filepath
    )
    if has_diff:
        raise AssertionError(
            f"Current file ({staged_filepath}) does not match expected file: {fixture_path}"
        )
40
+
41
+
42
def update_stager_fixtures(stager_output_path: Path, staged_filepath: Path):
    """Copy the staged file into ``stager_output_path``, keeping its file name."""
    shutil.copy(staged_filepath, stager_output_path / staged_filepath.name)
45
+
46
+
47
def stager_validation(
    stager: UploadStager,
    tmp_dir: Path,
    input_file: Path,
    configs: StagerValidationConfigs,
    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
) -> None:
    """Run ``stager`` on ``input_file`` and either validate or re-record the result.

    Args:
        stager: Upload stager under test.
        tmp_dir: Directory the stager writes its output to.
        input_file: Elements file handed to the stager.
        configs: Expected values and fixture locations for this test.
        overwrite_fixtures: When true, the staged file is copied over the
            saved fixture instead of being validated. The default is read
            from the ``OVERWRITE_FIXTURES`` environment variable once, when
            this function is defined.
    """
    filename = input_file.name
    staged_filepath = stager.run(
        elements_filepath=input_file,
        file_data=FileData(
            source_identifiers=SourceIdentifiers(fullpath=filename, filename=filename),
            connector_type=configs.test_id,
            identifier="mock file data",
        ),
        output_dir=tmp_dir,
        output_filename=filename,
    )
    if overwrite_fixtures:
        print("Running fixtures update")
        update_stager_fixtures(
            stager_output_path=configs.stager_output_dir(), staged_filepath=staged_filepath
        )
        return
    print("Running validation")
    run_all_stager_validations(
        configs=configs, input_file=input_file, staged_filepath=staged_filepath
    )
@@ -0,0 +1,75 @@
1
+ import json
2
+ from pathlib import Path
3
+
4
+ import ndjson
5
+ from bs4 import BeautifulSoup
6
+ from deepdiff import DeepDiff
7
+
8
+
9
def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
    """Return True when both JSON files parse to equal data.

    When they differ, the DeepDiff result is printed as JSON and False is returned.
    """
    with expected_filepath.open() as expected_f:
        expected = json.load(expected_f)
    with current_filepath.open() as current_f:
        current = json.load(current_f)
    diff = DeepDiff(expected, current)
    if not diff:
        return True
    print("diff between expected and current json")
    print(diff.to_json(indent=2))
    return False
20
+
21
+
22
def ndjson_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
    """Return True when both NDJSON files hold the same records in the same order.

    A difference in record count is reported and treated as a mismatch right
    away, so a shorter current file can no longer raise ``IndexError`` and a
    longer one with a matching prefix can no longer pass.
    """
    with expected_filepath.open() as f:
        expected_data = ndjson.load(f)
    with current_filepath.open() as f:
        current_data = ndjson.load(f)
    if len(current_data) != len(expected_data):
        print(
            f"expected data length {len(expected_data)} "
            f"didn't match current results: {len(current_data)}"
        )
        return False
    for i, (e, r) in enumerate(zip(expected_data, current_data)):
        if e != r:
            print(f"{i}th element doesn't match:\nexpected {e}\ncurrent {r}")
            return False
    return True
39
+
40
+
41
def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
    """Return True when the text extracted from both HTML files is identical.

    Only ``BeautifulSoup(...).text`` is compared, so differences confined to
    markup do not count as a mismatch.
    """

    def _text_of(path: Path) -> str:
        # Parse with the built-in "html.parser" backend and keep only the text
        with path.open() as html_f:
            return BeautifulSoup(html_f, "html.parser").text

    expected_text = _text_of(expected_filepath)
    current_text = _text_of(current_filepath)
    return expected_text == current_text
47
+
48
+
49
def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
    """Return True when both text files have the same lines.

    A differing line count or differing content is printed and reported as False.
    """
    with expected_filepath.open() as f:
        expected_lines = f.readlines()
    with current_filepath.open() as f:
        current_lines = f.readlines()

    expected_len, current_len = len(expected_lines), len(current_lines)
    if expected_len != current_len:
        print(
            f"Lines in expected text file ({expected_len}) "
            f"don't match current text file ({current_len})"
        )
        return False

    expected_text = "\n".join(expected_lines)
    current_text = "\n".join(current_lines)
    if expected_text != current_text:
        print("txt content don't match:")
        print(f"expected: {expected_text}")
        print(f"current: {current_text}")
        return False
    return True
68
+
69
+
70
# Maps a file suffix (including the leading dot) to the equality-check function
# used for files of that type. Each function takes (expected_filepath,
# current_filepath) and returns True when the two files are considered equal.
file_type_equality_check = {
    ".json": json_equality_check,
    ".ndjson": ndjson_equality_check,
    ".html": html_equality_check,
    ".txt": txt_equality_check,
}
@@ -0,0 +1,299 @@
1
+ import json
2
+ import os
3
+ import shutil
4
+ from pathlib import Path
5
+ from typing import Callable, Optional
6
+
7
+ from deepdiff import DeepDiff
8
+ from pydantic import Field
9
+
10
+ from test.integration.connectors.utils.validation.utils import ValidationConfig
11
+ from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
12
+
13
+
14
class SourceValidationConfigs(ValidationConfig):
    """Expected values and optional hooks for validating a source connector run."""

    # Expected number of pre-download (indexed) FileData records; not checked when falsy
    expected_number_indexed_file_data: Optional[int] = None
    # Expected number of downloaded files; not checked when falsy
    expected_num_files: Optional[int] = None
    # Optional callbacks invoked with each pre-/post-download FileData
    predownload_file_data_check: Optional[Callable[[FileData], None]] = None
    postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
    # Dotted field paths dropped before file data is compared; a final "*"
    # segment clears every key of the parent mapping
    exclude_fields: list[str] = Field(
        default_factory=lambda: ["local_download_path", "metadata.date_processed"]
    )
    # Extra paths to exclude on top of ``exclude_fields``
    exclude_fields_extend: list[str] = Field(default_factory=list)
    validate_downloaded_files: bool = False
    validate_file_data: bool = True

    def get_exclude_fields(self) -> list[str]:
        """Return ``exclude_fields`` followed by ``exclude_fields_extend`` as a new list.

        A fresh list is built on every call so that ``self.exclude_fields`` is
        not extended in place (which made it grow with duplicates per call).
        """
        return [*self.exclude_fields, *self.exclude_fields_extend]

    def run_file_data_validation(
        self, predownload_file_data: FileData, postdownload_file_data: FileData
    ):
        """Invoke the configured pre- and post-download callbacks, if any."""
        if predownload_file_data_check := self.predownload_file_data_check:
            predownload_file_data_check(predownload_file_data)
        if postdownload_file_data_check := self.postdownload_file_data_check:
            postdownload_file_data_check(postdownload_file_data)

    def run_download_dir_validation(self, download_dir: Path):
        """Assert the number of files under ``download_dir`` (recursively) when expected."""
        if expected_num_files := self.expected_num_files:
            downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
            assert len(downloaded_files) == expected_num_files

    def omit_ignored_fields(self, data: dict) -> dict:
        """Return a copy of ``data`` without the excluded fields.

        The copy is deep, so removing a nested field (for example
        ``metadata.date_processed``) never alters the caller's ``data``.
        """
        import copy  # local import keeps this fix self-contained

        # Ignore fields that dynamically change every time the tests run
        copied_data = copy.deepcopy(data)
        for exclude_field in self.get_exclude_fields():
            *parent_keys, drop_field = exclude_field.split(".")
            current_val = copied_data
            for key in parent_keys:
                # A missing parent yields a throwaway dict, making the drop a no-op
                current_val = current_val.get(key, {})
            if drop_field == "*":
                current_val.clear()
            else:
                current_val.pop(drop_field, None)
        return copied_data
64
+
65
+
66
def get_files(dir_path: Path) -> list[str]:
    """Return every file under ``dir_path`` (recursively) as a ``/``-separated relative path.

    ``Path.relative_to`` strips only the leading ``dir_path`` prefix. The
    earlier ``str.replace`` approach removed every occurrence of the directory
    string, corrupting paths in which that string appeared again.
    """
    return [f.relative_to(dir_path).as_posix() for f in dir_path.rglob("*") if f.is_file()]
70
+
71
+
72
def check_files(expected_output_dir: Path, all_file_data: list[FileData]):
    """Assert the fixture dir holds exactly one ``<identifier>.json`` per file data record."""
    expected_names = set(get_files(dir_path=expected_output_dir))
    current_names = {f"{file_data.identifier}.json" for file_data in all_file_data}
    # Names present on only one side, in either direction
    mismatched = expected_names.symmetric_difference(current_names)
    assert not mismatched, "diff in files that exist: {}".format(", ".join(mismatched))
77
+
78
+
79
def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):
    """Assert both directories contain the same set of relative file paths."""
    expected_names = set(get_files(dir_path=expected_output_dir))
    current_names = set(get_files(dir_path=current_output_dir))
    # Paths present on only one side, in either direction
    mismatched = expected_names.symmetric_difference(current_names)
    assert not mismatched, "diff in files that exist: {}".format(", ".join(mismatched))
84
+
85
+
86
def check_contents(
    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
):
    """Compare each file data record with its saved ``<identifier>.json`` fixture.

    Fields listed by ``configs`` are dropped from both sides before comparing.
    Every diff is printed, and the final assertion names the records that
    differed (previously the message only interpolated the boolean flag).

    Raises:
        AssertionError: if any record differs from its fixture.
    """
    identifiers_with_diff = []
    for file_data in all_file_data:
        file_data_path = expected_output_dir / f"{file_data.identifier}.json"
        with file_data_path.open("r") as file:
            expected_file_data_contents = json.load(file)
        current_file_data_contents = file_data.model_dump()
        expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
        current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
        diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
        if diff:
            identifiers_with_diff.append(str(file_data.identifier))
            print(f"diff against fixture {file_data_path}:")
            print(diff.to_json(indent=2))
    assert not identifiers_with_diff, "Diffs found between files: {}".format(
        ", ".join(identifiers_with_diff)
    )
102
+
103
+
104
def check_raw_file_contents(
    expected_output_dir: Path,
    current_output_dir: Path,
    configs: SourceValidationConfigs,
):
    """Compare every file under ``current_output_dir`` with its counterpart fixture.

    Raises:
        AssertionError: naming the expected-file paths for which
            ``configs.detect_diff`` reported a difference.
    """
    mismatched = []
    for relative_path in get_files(dir_path=current_output_dir):
        current_file_path = current_output_dir / relative_path
        expected_file_path = expected_output_dir / relative_path
        if not configs.detect_diff(expected_file_path, current_file_path):
            continue
        mismatched.append(str(expected_file_path))
        print(f"diffs between files {expected_file_path} and {current_file_path}")
    assert not mismatched, "Diffs found between files: {}".format(", ".join(mismatched))
120
+
121
+
122
def run_expected_results_validation(
    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
):
    """Validate file data records against fixtures: first file names, then contents."""
    shared_args = {"expected_output_dir": expected_output_dir, "all_file_data": all_file_data}
    check_files(**shared_args)
    check_contents(**shared_args, configs=configs)
129
+
130
+
131
def run_expected_download_files_validation(
    expected_output_dir: Path,
    current_download_dir: Path,
    configs: SourceValidationConfigs,
):
    """Validate downloaded files against fixtures: first the file set, then contents."""
    dir_args = {
        "expected_output_dir": expected_output_dir,
        "current_output_dir": current_download_dir,
    }
    check_files_in_paths(**dir_args)
    check_raw_file_contents(**dir_args, configs=configs)
144
+
145
+
146
def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
    """Assert ``download_files`` equals the list saved in ``directory_structure.json``.

    The record is read from ``expected_output_dir`` and its
    ``"directory_structure"`` entry is compared with ``==``, so order matters.
    """
    record_path = expected_output_dir / "directory_structure.json"
    with record_path.open("r") as record_file:
        record = json.load(record_file)
    recorded_structure = record["directory_structure"]
    assert recorded_structure == download_files
152
+
153
+
154
def update_fixtures(
    output_dir: Path,
    download_dir: Path,
    all_file_data: list[FileData],
    save_downloads: bool = False,
    save_filedata: bool = True,
):
    """Overwrite the saved fixtures under ``output_dir`` with the current results.

    Args:
        output_dir: Fixture root; created when it does not exist.
        download_dir: Directory holding the files downloaded by this run.
        all_file_data: Records written to ``file_data/<identifier>.json``.
        save_downloads: Also copy ``download_dir`` to ``downloads/``.
        save_filedata: Write the ``file_data/`` fixtures.
    """
    if not output_dir.exists():
        output_dir.mkdir(parents=True)

    # Rewrite the current file data from scratch
    if save_filedata:
        file_data_dir = output_dir / "file_data"
        shutil.rmtree(path=file_data_dir, ignore_errors=True)
        print(
            f"Writing {len(all_file_data)} file data to "
            f"saved fixture location {file_data_dir}"
        )
        file_data_dir.mkdir(parents=True, exist_ok=True)
        for file_data in all_file_data:
            target = file_data_dir / f"{file_data.identifier}.json"
            target.write_text(json.dumps(file_data.model_dump(), indent=2))

    # Record the (sorted) file structure of the download directory
    download_files = sorted(get_files(dir_path=download_dir))
    structure_record = output_dir / "directory_structure.json"
    structure_record.write_text(json.dumps({"directory_structure": download_files}, indent=2))

    # If applicable, save the raw downloads as well
    if save_downloads:
        downloads_dir = output_dir / "downloads"
        shutil.rmtree(path=downloads_dir, ignore_errors=True)
        print(
            f"Writing {len(download_files)} downloaded files to "
            f"saved fixture location {downloads_dir}"
        )
        shutil.copytree(download_dir, downloads_dir)
193
+
194
+
195
def run_all_validations(
    configs: SourceValidationConfigs,
    predownload_file_data: list[FileData],
    postdownload_file_data: list[FileData],
    download_dir: Path,
    test_output_dir: Path,
):
    """Run every configured validation for a source connector run.

    All fixture lookups use ``test_output_dir``; previously the directory
    structure check alone used ``configs.test_output_dir()`` instead of the
    parameter.

    Args:
        configs: Expected values, hooks and toggles.
        predownload_file_data: File data records as produced by the indexer.
        postdownload_file_data: File data records returned by the downloader.
        download_dir: Directory the downloader wrote to.
        test_output_dir: Root directory of the saved fixtures for this test.

    Raises:
        AssertionError: if any validation fails.
    """
    if expected_number_indexed_file_data := configs.expected_number_indexed_file_data:
        assert (
            len(predownload_file_data) == expected_number_indexed_file_data
        ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
    if expected_num_files := configs.expected_num_files:
        assert (
            len(postdownload_file_data) == expected_num_files
        ), f"expected {expected_num_files} but got {len(postdownload_file_data)}"

    # Pairs are matched by position; zip stops at the shorter list
    for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
        configs.run_file_data_validation(
            predownload_file_data=pre_data, postdownload_file_data=post_data
        )
    configs.run_download_dir_validation(download_dir=download_dir)
    if configs.validate_file_data:
        run_expected_results_validation(
            expected_output_dir=test_output_dir / "file_data",
            all_file_data=get_all_file_data(
                all_predownload_file_data=predownload_file_data,
                all_postdownload_file_data=postdownload_file_data,
            ),
            configs=configs,
        )
    download_files = sorted(get_files(dir_path=download_dir))
    run_directory_structure_validation(
        expected_output_dir=test_output_dir, download_files=download_files
    )
    if configs.validate_downloaded_files:
        run_expected_download_files_validation(
            expected_output_dir=test_output_dir / "downloads",
            current_download_dir=download_dir,
            configs=configs,
        )
234
+
235
+
236
def get_all_file_data(
    all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
) -> list[FileData]:
    """Return post-download records plus indexed records that were never downloaded.

    The result is a new list: every post-download record in order, followed by
    each pre-download record whose ``identifier`` does not occur among the
    post-download records. Neither input list is modified (the earlier ``+=``
    on an alias extended the caller's post-download list in place).
    """
    # Build the identifier set once instead of rebuilding a list per record
    downloaded_identifiers = {fd.identifier for fd in all_postdownload_file_data}
    indexed_only = [
        fd for fd in all_predownload_file_data if fd.identifier not in downloaded_identifiers
    ]
    return [*all_postdownload_file_data, *indexed_only]
247
+
248
+
249
async def source_connector_validation(
    indexer: Indexer,
    downloader: Downloader,
    configs: SourceValidationConfigs,
    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
) -> None:
    """Index and download everything from a source, then validate or re-record fixtures.

    Runs the common validations for a source connector, supporting dynamic
    validators passed in through ``configs`` along with comparisons against
    the saved expected values. If ``overwrite_fixtures`` is set to True, all
    validators are skipped and the expected values are overwritten with what
    this run generated. The default is read from the ``OVERWRITE_FIXTURES``
    environment variable once, when this function is defined.
    """
    all_predownload_file_data = []
    all_postdownload_file_data = []
    indexer.precheck()
    download_dir = downloader.download_config.download_dir
    test_output_dir = configs.test_output_dir()
    for file_data in indexer.run():
        assert file_data
        # Snapshot the record before the downloader gets a chance to change it
        all_predownload_file_data.append(file_data.model_copy(deep=True))
        if downloader.is_async():
            resp = await downloader.run_async(file_data=file_data)
        else:
            resp = downloader.run(file_data=file_data)
        # A downloader may return a single response or a list of them
        responses = resp if isinstance(resp, list) else [resp]
        for response in responses:
            all_postdownload_file_data.append(response["file_data"].model_copy(deep=True))

    if overwrite_fixtures:
        print("Running fixtures update")
        update_fixtures(
            output_dir=test_output_dir,
            download_dir=download_dir,
            all_file_data=get_all_file_data(
                all_predownload_file_data=all_predownload_file_data,
                all_postdownload_file_data=all_postdownload_file_data,
            ),
            save_downloads=configs.validate_downloaded_files,
            save_filedata=configs.validate_file_data,
        )
        return

    print("Running validation")
    run_all_validations(
        configs=configs,
        predownload_file_data=all_predownload_file_data,
        postdownload_file_data=all_postdownload_file_data,
        download_dir=download_dir,
        test_output_dir=test_output_dir,
    )
@@ -0,0 +1,36 @@
1
+ import filecmp
2
+ import shutil
3
+ from pathlib import Path
4
+ from typing import Callable, Optional
5
+
6
+ from pydantic import BaseModel
7
+
8
+ from test.integration.connectors.utils.constants import expected_results_path
9
+ from test.integration.connectors.utils.validation.equality import file_type_equality_check
10
+
11
+
12
class ValidationConfig(BaseModel):
    """Base settings shared by the connector validation helpers."""

    # Identifies the test; also the name of its directory under expected_results_path
    test_id: str
    # Optional custom comparison taking (expected, current) and returning True when equal
    file_equality_check: Optional[Callable[[Path, Path], bool]] = None

    def test_output_dir(self) -> Path:
        """Return the directory holding this test's saved expected results."""
        return expected_results_path / self.test_id

    def detect_diff(self, expected_filepath: Path, current_filepath: Path) -> bool:
        """Return True when the two files are considered different.

        Differing suffixes always count as a diff. Otherwise the custom
        ``file_equality_check`` is used when set, then the suffix-specific
        check from ``file_type_equality_check``, and finally a byte-wise
        ``filecmp.cmp``.
        """
        suffix = expected_filepath.suffix
        if suffix != current_filepath.suffix:
            return True
        if self.file_equality_check:
            return not self.file_equality_check(expected_filepath, current_filepath)
        suffix_check = file_type_equality_check.get(suffix)
        if suffix_check is not None:
            return not suffix_check(
                expected_filepath=expected_filepath, current_filepath=current_filepath
            )
        # Fallback is using filecmp.cmp to compare the files
        return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
32
+
33
+
34
def reset_dir(dir_path: Path) -> None:
    """Delete ``dir_path`` (errors ignored) and recreate it, parents included, empty."""
    shutil.rmtree(dir_path, ignore_errors=True)
    Path.mkdir(dir_path, parents=True)
File without changes
@@ -0,0 +1,15 @@
1
+ import json
2
+ from pathlib import Path
3
+
4
+ import pytest
5
+
6
+
7
@pytest.fixture
def collections_schema_config() -> dict:
    """Load ``assets/elements.json`` (next to this file) and return the parsed JSON."""
    config_file = Path(__file__).parent / "assets" / "elements.json"
    assert config_file.exists()
    assert config_file.is_file()
    with config_file.open() as config_data:
        schema = json.load(config_data)
    return schema
@@ -0,0 +1,34 @@
1
+ import pytest
2
+ from pydantic import ValidationError
3
+
4
+ from unstructured_ingest.v2.processes.connectors.weaviate.cloud import (
5
+ CloudWeaviateAccessConfig,
6
+ CloudWeaviateConnectionConfig,
7
+ )
8
+
9
+
10
def test_weaviate_failing_connection_config():
    """An API key combined with a username and password must raise ValidationError."""
    with pytest.raises(ValidationError):
        # Both objects are built inside the context so either may raise
        access_config = CloudWeaviateAccessConfig(api_key="my key", password="password")
        CloudWeaviateConnectionConfig(
            cluster_url="clusterurl",
            username="username",
            access_config=access_config,
        )
17
+
18
+
19
def test_weaviate_connection_config_happy_path():
    """A config with only an API key and a cluster URL must construct without error."""
    access_config = CloudWeaviateAccessConfig(api_key="my key")
    CloudWeaviateConnectionConfig(cluster_url="clusterurl", access_config=access_config)
26
+
27
+
28
def test_weaviate_connection_config_anonymous():
    """With ``anonymous=True`` the config must construct even when credentials are given."""
    access_config = CloudWeaviateAccessConfig(api_key="my key", password="password")
    CloudWeaviateConnectionConfig(
        cluster_url="clusterurl",
        anonymous=True,
        username="username",
        access_config=access_config,
    )
@@ -0,0 +1,131 @@
1
+ import json
2
+ import time
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+ import requests
7
+ import weaviate
8
+ from weaviate.client import WeaviateClient
9
+
10
+ from test.integration.connectors.utils.constants import DESTINATION_TAG
11
+ from test.integration.connectors.utils.docker import container_context
12
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
13
+ from unstructured_ingest.v2.processes.connectors.weaviate.local import (
14
+ CONNECTOR_TYPE,
15
+ LocalWeaviateConnectionConfig,
16
+ LocalWeaviateUploader,
17
+ LocalWeaviateUploaderConfig,
18
+ LocalWeaviateUploadStager,
19
+ )
20
+
21
# Name of the Weaviate collection the tests in this module upload to and count.
# NOTE(review): presumably must match the collection defined in assets/elements.json — confirm.
COLLECTION_NAME = "elements"
22
+
23
+
24
def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
    """Block until the local Weaviate HTTP port answers a request.

    Any HTTP response counts as success; the status code is not inspected.

    Args:
        timeout: Total number of seconds to keep trying.
        interval: Seconds to sleep between attempts. Each request is also
            capped at this duration (at least one second), so a connection
            that hangs cannot block past the overall deadline indefinitely.

    Raises:
        TimeoutError: if no request succeeded within ``timeout`` seconds.
    """
    # NOTE(review): Weaviate documents its readiness endpoint as
    # /v1/.well-known/ready; this path ends in "read" — confirm whether intended.
    url = "http://localhost:8080/v1/.well-known/read"
    request_timeout = max(interval, 1)
    start_time = time.time()
    while time.time() - start_time < timeout:
        try:
            requests.get(url, timeout=request_timeout)
            return
        except Exception as e:
            print(f"Failed to validate container healthy, sleeping for {interval} seconds: {e}")
            time.sleep(interval)
    raise TimeoutError("Docker container never came up healthy")
34
+
35
+
36
@pytest.fixture
def collection(collections_schema_config: dict) -> str:
    """Start a Weaviate container, create the collection, and yield its name.

    The container context stays open until the requesting test finishes.
    """
    weaviate_image = "semitechnologies/weaviate:1.27.3"
    port_mapping = {8080: 8080, 50051: 50051}
    with container_context(image=weaviate_image, ports=port_mapping):
        wait_for_container()
        # The client is only needed long enough to create the collection
        with weaviate.connect_to_local() as client:
            client.collections.create_from_dict(config=collections_schema_config)
        yield COLLECTION_NAME
46
+
47
+
48
def get_count(client: WeaviateClient) -> int:
    """Return the total object count reported for ``COLLECTION_NAME``."""
    aggregate = client.collections.get(COLLECTION_NAME).aggregate
    return aggregate.over_all(total_count=True).total_count
52
+
53
+
54
def validate_count(expected_count: int, retries: int = 10, interval: int = 1) -> None:
    """Assert the collection holds ``expected_count`` objects, polling until it does.

    The count is read once and then re-read up to ``retries`` more times,
    sleeping ``interval`` seconds before each re-read, until it matches.

    Raises:
        AssertionError: if the count still differs after the last attempt.
    """
    with weaviate.connect_to_local() as weaviate_client:
        current_count = get_count(client=weaviate_client)
        for _ in range(retries):
            if current_count == expected_count:
                break
            time.sleep(interval)
            current_count = get_count(client=weaviate_client)
    assert current_count == expected_count, (
        f"Expected count ({expected_count}) doesn't match how "
        f"much came back from collection: {current_count}"
    )
66
+
67
+
68
def run_uploader_and_validate(
    uploader: LocalWeaviateUploader, path: Path, file_data: FileData, expected_count: int
):
    """Precheck ``uploader``, upload ``path``, then assert the resulting object count."""
    uploader.precheck()
    upload_args = {"path": path, "file_data": file_data}
    uploader.run(**upload_args)
    validate_count(expected_count=expected_count)
74
+
75
+
76
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
    """Stage ``upload_file`` once, then upload it with three uploader configurations.

    After each upload the collection is expected to hold exactly as many
    objects as the staged file has elements.
    """
    filename = upload_file.name
    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=filename, filename=filename),
        connector_type=CONNECTOR_TYPE,
        identifier="mock file data",
    )
    staged_filepath = LocalWeaviateUploadStager().run(
        elements_filepath=upload_file,
        file_data=file_data,
        output_dir=tmp_path,
        output_filename=filename,
    )

    # Upload order: dynamic batching, fixed batch size, rate limited
    uploaders = (
        LocalWeaviateUploader(
            upload_config=LocalWeaviateUploaderConfig(
                collection=COLLECTION_NAME,
            ),
            connection_config=LocalWeaviateConnectionConfig(),
        ),
        LocalWeaviateUploader(
            upload_config=LocalWeaviateUploaderConfig(
                collection=COLLECTION_NAME, batch_size=10, dynamic_batch=False
            ),
            connection_config=LocalWeaviateConnectionConfig(),
        ),
        LocalWeaviateUploader(
            upload_config=LocalWeaviateUploaderConfig(
                collection=COLLECTION_NAME, requests_per_minute=50, dynamic_batch=False
            ),
            connection_config=LocalWeaviateConnectionConfig(),
        ),
    )

    with staged_filepath.open() as staged_f:
        expected_count = len(json.load(staged_f))

    for uploader in uploaders:
        run_uploader_and_validate(
            uploader=uploader,
            path=staged_filepath,
            file_data=file_data,
            expected_count=expected_count,
        )
File without changes
@@ -0,0 +1,13 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+
5
+
6
@pytest.fixture
def embedder_file() -> Path:
    """Return the path of the ``DA-1p-with-duplicate-pages.pdf.json`` asset next to this file."""
    asset_path = Path(__file__).parent / "assets" / "DA-1p-with-duplicate-pages.pdf.json"
    assert asset_path.exists()
    assert asset_path.is_file()
    return asset_path