unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,434 @@
1
+ import hashlib
2
+ import json
3
+ from abc import ABC, abstractmethod
4
+ from contextlib import contextmanager
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from time import time
9
+ from typing import Any, Generator, Union
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+ from dateutil import parser
14
+ from pydantic import BaseModel, Field, Secret
15
+
16
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
17
+ from unstructured_ingest.utils.data_prep import get_data, get_data_df, split_dataframe
18
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
19
+ from unstructured_ingest.v2.interfaces import (
20
+ AccessConfig,
21
+ BatchFileData,
22
+ BatchItem,
23
+ ConnectionConfig,
24
+ Downloader,
25
+ DownloaderConfig,
26
+ DownloadResponse,
27
+ FileData,
28
+ FileDataSourceMetadata,
29
+ Indexer,
30
+ IndexerConfig,
31
+ SourceIdentifiers,
32
+ Uploader,
33
+ UploaderConfig,
34
+ UploadStager,
35
+ UploadStagerConfig,
36
+ download_responses,
37
+ )
38
+ from unstructured_ingest.v2.logger import logger
39
+ from unstructured_ingest.v2.utils import get_enhanced_element_id
40
+
41
+ _COLUMNS = (
42
+ "id",
43
+ "element_id",
44
+ "text",
45
+ "embeddings",
46
+ "type",
47
+ "system",
48
+ "layout_width",
49
+ "layout_height",
50
+ "points",
51
+ "url",
52
+ "version",
53
+ "date_created",
54
+ "date_modified",
55
+ "date_processed",
56
+ "permissions_data",
57
+ "record_locator",
58
+ "category_depth",
59
+ "parent_id",
60
+ "attached_filename",
61
+ "filetype",
62
+ "last_modified",
63
+ "file_directory",
64
+ "filename",
65
+ "languages",
66
+ "page_number",
67
+ "links",
68
+ "page_name",
69
+ "link_urls",
70
+ "link_texts",
71
+ "sent_from",
72
+ "sent_to",
73
+ "subject",
74
+ "section",
75
+ "header_footer_type",
76
+ "emphasized_text_contents",
77
+ "emphasized_text_tags",
78
+ "text_as_html",
79
+ "regex_metadata",
80
+ "detection_class_prob",
81
+ )
82
+
83
+ _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
84
+
85
+
86
class SqlAdditionalMetadata(BaseModel):
    # Extra metadata attached to each indexed batch so the downloader knows what to query.
    # Table the batch's rows are read from.
    table_name: str
    # Column whose values identify the rows of that table.
    id_column: str
89
+
90
+
91
class SqlBatchFileData(BatchFileData):
    # Batch file data that additionally records the source table and id column.
    additional_metadata: SqlAdditionalMetadata
93
+
94
+
95
def parse_date_string(date_value: Union[str, int]) -> datetime:
    """Turn a timestamp or a date string into a ``datetime``.

    Integers are read as epoch milliseconds. Any other value is first tried as
    epoch seconds via ``float()``. If either attempt fails for any reason, the
    value is handed to ``dateutil``'s ``parser.parse`` instead.
    """
    try:
        if isinstance(date_value, int):
            seconds = float(date_value) / 1000
        else:
            seconds = float(date_value)
        return datetime.fromtimestamp(seconds)
    except Exception as e:
        logger.debug(f"date {date_value} string not a timestamp: {e}")
        return parser.parse(date_value)
102
+
103
+
104
class SQLAccessConfig(AccessConfig):
    # Base access config for the SQL connectors; declares no fields of its own.
    pass
106
+
107
+
108
class SQLConnectionConfig(ConnectionConfig, ABC):
    # Abstract connection settings for SQL connectors. Subclasses provide the
    # driver-specific connection and cursor context managers declared below.
    access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)

    @abstractmethod
    @contextmanager
    def get_connection(self) -> Generator[Any, None, None]:
        """Context manager yielding a database connection; implemented by subclasses."""
        pass

    @abstractmethod
    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        """Context manager yielding a database cursor; implemented by subclasses."""
        pass
120
+
121
+
122
class SQLIndexerConfig(IndexerConfig):
    # Table whose rows are indexed.
    table_name: str
    # Column selected to enumerate the rows of that table.
    id_column: str
    # Maximum number of ids placed in each emitted batch.
    batch_size: int = 100
126
+
127
+
128
class SQLIndexer(Indexer, ABC):
    """Indexer that reads every id from a table and emits the ids in batches."""

    connection_config: SQLConnectionConfig
    index_config: SQLIndexerConfig

    def _get_doc_ids(self) -> list[str]:
        """Return all values of the configured id column, sorted ascending."""
        query = f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
        with self.connection_config.get_cursor() as cursor:
            cursor.execute(query)
            fetched = cursor.fetchall()
            return sorted(row[0] for row in fetched)

    def precheck(self) -> None:
        """Run a trivial query to confirm the database is reachable.

        Raises:
            SourceConnectionError: if opening the cursor or running the query fails.
        """
        try:
            with self.connection_config.get_cursor() as cursor:
                cursor.execute("SELECT 1;")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}")

    def run(self, **kwargs: Any) -> Generator[SqlBatchFileData, None, None]:
        """Yield one ``SqlBatchFileData`` per group of at most ``batch_size`` ids."""
        ids = self._get_doc_ids()
        size = self.index_config.batch_size
        # Ceiling division: the last batch may hold fewer than `size` ids.
        batch_count = (len(ids) + size - 1) // size
        id_batches: list[frozenset[str]] = [
            frozenset(ids[n * size : (n + 1) * size]) for n in range(batch_count)  # noqa
        ]
        for id_batch in id_batches:
            source_metadata = FileDataSourceMetadata(date_processed=str(time()))
            table_metadata = SqlAdditionalMetadata(
                table_name=self.index_config.table_name,
                id_column=self.index_config.id_column,
            )
            yield SqlBatchFileData(
                connector_type=self.connector_type,
                metadata=source_metadata,
                additional_metadata=table_metadata,
                batch_items=[BatchItem(identifier=str(item)) for item in id_batch],
            )
175
+
176
+
177
class SQLDownloaderConfig(DownloaderConfig):
    # Columns to select when downloading rows. The SQLite downloader selects "*"
    # when this list is empty.
    # Declared with pydantic's Field, like the other config models in this module,
    # rather than dataclasses.field (this class is not a dataclass).
    fields: list[str] = Field(default_factory=list)
179
+
180
+
181
class SQLDownloader(Downloader, ABC):
    """Downloader that writes every fetched table row to its own CSV file."""

    connection_config: SQLConnectionConfig
    download_config: SQLDownloaderConfig

    @abstractmethod
    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Fetch the rows of a batch; return ``(rows, column_names)``."""
        pass

    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list[pd.DataFrame]:
        """Build one single-row DataFrame per fetched row."""
        records = [dict(zip(columns, values)) for values in rows]
        table = pd.DataFrame(records)
        return [
            pd.DataFrame([record.values], columns=table.columns)
            for _, record in table.iterrows()
        ]

    def get_data(self, file_data: SqlBatchFileData) -> list[pd.DataFrame]:
        """Query the database for the batch and split the result into per-row frames."""
        rows, columns = self.query_db(file_data=file_data)
        return self.sql_to_df(rows=rows, columns=columns)

    def get_identifier(self, table_name: str, record_id: str) -> str:
        """Return ``<table>-<record id>``, suffixed with a short hash of the selected fields."""
        identifier = f"{table_name}-{record_id}"
        selected = self.download_config.fields
        if not selected:
            return identifier
        # First 8 hex chars of the sha256 of the comma-joined field list.
        digest = hashlib.sha256(",".join(selected).encode()).hexdigest()[:8]
        return f"{identifier}-{digest}"

    def generate_download_response(
        self, result: pd.DataFrame, file_data: SqlBatchFileData
    ) -> DownloadResponse:
        """Write a single-row frame to ``<download_dir>/<identifier>.csv`` and wrap it.

        Note that ``file_data.source_identifiers`` is overwritten with the CSV
        filename before the file data is cast to ``FileData``.
        """
        additional = file_data.additional_metadata
        id_column = additional.id_column
        table_name = additional.table_name
        record_id = result.iloc[0][id_column]

        filename_id = self.get_identifier(table_name=table_name, record_id=record_id)
        filename = f"{filename_id}.csv"
        download_path = self.download_dir / Path(filename)
        logger.debug(
            f"Downloading results from table {table_name} and id {record_id} to {download_path}"
        )
        download_path.parent.mkdir(parents=True, exist_ok=True)
        result.to_csv(download_path, index=False)

        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
        cast_file_data = FileData.cast(file_data=file_data)
        cast_file_data.identifier = filename_id
        return super().generate_download_response(
            file_data=cast_file_data, download_path=download_path
        )

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download every row of the batch; return one response per row."""
        batch = SqlBatchFileData.cast(file_data=file_data)
        return [
            self.generate_download_response(result=frame, file_data=batch)
            for frame in self.get_data(file_data=batch)
        ]
241
+
242
+
243
class SQLUploadStagerConfig(UploadStagerConfig):
    # Stager config for the SQL connectors; declares no options of its own.
    pass
245
+
246
+
247
@dataclass
class SQLUploadStager(UploadStager):
    """Stager that flattens partitioned elements into rows for a SQL table."""

    upload_stager_config: SQLUploadStagerConfig = field(default_factory=SQLUploadStagerConfig)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Flatten one element into a row dict restricted to ``_COLUMNS``.

        The ``metadata`` mapping, and its nested ``data_source`` and
        ``coordinates`` mappings, are merged into the top level. The row gets an
        ``id`` from ``get_enhanced_element_id`` and the file's identifier under
        ``RECORD_ID_LABEL``. ``element_dict`` is not modified.
        """
        data = element_dict.copy()
        # `data` is only a shallow copy, so copy the nested metadata dict before
        # popping keys out of it; otherwise the caller's element would be mutated.
        metadata: dict[str, Any] = dict(data.pop("metadata", {}))
        data_source = metadata.pop("data_source", {})
        coordinates = metadata.pop("coordinates", {})

        # Later updates win on key collisions: metadata < data_source < coordinates.
        data.update(metadata)
        data.update(data_source)
        data.update(coordinates)

        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)

        # remove extraneous, not supported columns
        element = {k: v for k, v in data.items() if k in _COLUMNS}
        element[RECORD_ID_LABEL] = file_data.identifier
        return element

    def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalise column values in place and return the same frame.

        Date columns become epoch-second floats. The structured columns become
        JSON strings, or ``None`` when the value is not a list or dict. The
        ``version``, ``page_number`` and ``regex_metadata`` columns become strings.
        """
        for column in (c for c in _DATE_COLUMNS if c in df.columns):
            df[column] = df[column].apply(parse_date_string).apply(lambda date: date.timestamp())

        structured = ("permissions_data", "record_locator", "points", "links")
        for column in (c for c in structured if c in df.columns):
            df[column] = df[column].apply(
                lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
            )

        stringified = ("version", "page_number", "regex_metadata")
        for column in (c for c in stringified if c in df.columns):
            df[column] = df[column].apply(str)
        return df

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Stage the elements file as row records and return the written output path."""
        elements_contents = get_data(path=elements_filepath)

        df = pd.DataFrame(
            data=[
                self.conform_dict(element_dict=element_dict, file_data=file_data)
                for element_dict in elements_contents
            ]
        )
        df = self.conform_dataframe(df=df)

        output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)

        self.write_output(output_path=output_path, data=df.to_dict(orient="records"))
        return output_path
307
+
308
+
309
class SQLUploaderConfig(UploaderConfig):
    # Upload options shared by the SQL destination connectors.
    # batch_size is the number of rows passed to each executemany() call.
    batch_size: int = Field(default=50, description="Number of records per batch")
    table_name: str = Field(default="elements", description="which table to upload contents to")
    # Column used to find and delete rows written for the same record on earlier runs.
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )
316
+
317
+
318
@dataclass
class SQLUploader(Uploader):
    """Uploader that writes staged element rows into a SQL table.

    Rows written earlier for the same record are deleted first, provided the
    table has the record id column. The data is then aligned with the table's
    columns and inserted in batches of ``upload_config.batch_size``.
    """

    upload_config: SQLUploaderConfig
    connection_config: SQLConnectionConfig
    # Placeholder used for bound parameters in the generated statements.
    values_delimiter: str = "?"

    def precheck(self) -> None:
        """Run a trivial query to confirm the database is reachable.

        Raises:
            DestinationConnectionError: if opening the cursor or running the query fails.
        """
        try:
            with self.connection_config.get_cursor() as cursor:
                cursor.execute("SELECT 1;")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        """Convert rows to driver-ready tuples, parsing non-null date column values."""
        output = []
        for row in data:
            parsed = []
            for column_name, value in zip(columns, row):
                if column_name in _DATE_COLUMNS and value is not None:
                    parsed.append(parse_date_string(value))
                else:
                    parsed.append(value)
            output.append(tuple(parsed))
        return output

    def _fit_to_schema(self, df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
        """Return a frame whose columns match the destination table.

        Args:
            df: rows about to be uploaded.
            columns: column names of the destination table.

        Returns:
            A frame without the columns the table lacks, and with a null-filled
            column added for every table column missing from ``df``.
        """
        # Keep the frame's columns and the table's columns under separate names;
        # reusing `columns` for both would make each difference below empty.
        df_columns = set(df.columns)
        schema_fields = set(columns)
        columns_to_drop = df_columns - schema_fields
        missing_columns = schema_fields - df_columns

        if columns_to_drop:
            logger.warning(
                "Following columns will be dropped to match the table's schema: "
                f"{', '.join(columns_to_drop)}"
            )
        if missing_columns:
            logger.info(
                "Following null filled columns will be added to match the table's schema:"
                f" {', '.join(missing_columns)} "
            )

        df = df.drop(columns=list(columns_to_drop))

        for column in missing_columns:
            # None rather than NaN, so the driver receives a SQL NULL.
            df[column] = None
        return df

    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
        """Replace the record's existing rows with the rows in ``df``."""
        if self.can_delete():
            self.delete_by_record_id(file_data=file_data)
        else:
            logger.warning(
                f"table doesn't contain expected "
                f"record id column "
                f"{self.upload_config.record_id_key}, skipping delete"
            )
        df.replace({np.nan: None}, inplace=True)
        # _fit_to_schema returns a new frame; the result must be kept.
        df = self._fit_to_schema(df=df, columns=self.get_table_columns())

        columns = list(df.columns)
        stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
            table_name=self.upload_config.table_name,
            columns=",".join(columns),
            values=",".join([self.values_delimiter for _ in columns]),
        )
        logger.info(
            f"writing a total of {len(df)} elements via"
            f" document batches to destination"
            f" table named {self.upload_config.table_name}"
            f" with batch size {self.upload_config.batch_size}"
        )
        for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
            with self.connection_config.get_cursor() as cursor:
                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
                logger.debug(f"running query: {stmt}")
                cursor.executemany(stmt, values)

    def get_table_columns(self) -> list[str]:
        """Return the destination table's column names from the cursor description."""
        with self.connection_config.get_cursor() as cursor:
            cursor.execute(f"SELECT * from {self.upload_config.table_name}")
            return [desc[0] for desc in cursor.description]

    def can_delete(self) -> bool:
        """Return True when the table has the configured record id column."""
        return self.upload_config.record_id_key in self.get_table_columns()

    def delete_by_record_id(self, file_data: FileData) -> None:
        """Delete every row whose record id column equals ``file_data.identifier``."""
        logger.debug(
            f"deleting any content with data "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from table {self.upload_config.table_name}"
        )
        stmt = (
            f"DELETE FROM {self.upload_config.table_name} "
            f"WHERE {self.upload_config.record_id_key} = {self.values_delimiter}"
        )
        with self.connection_config.get_cursor() as cursor:
            cursor.execute(stmt, [file_data.identifier])
            rowcount = cursor.rowcount
            logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload rows given as a list of dicts."""
        df = pd.DataFrame(data)
        self.upload_dataframe(df=df, file_data=file_data)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Upload the rows stored in the staged file at ``path``."""
        df = get_data_df(path=path)
        self.upload_dataframe(df=df, file_data=file_data)
@@ -0,0 +1,168 @@
1
+ import json
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Generator
6
+
7
+ from pydantic import Field, Secret, model_validator
8
+
9
+ from unstructured_ingest.v2.logger import logger
10
+ from unstructured_ingest.v2.processes.connector_registry import (
11
+ DestinationRegistryEntry,
12
+ SourceRegistryEntry,
13
+ )
14
+ from unstructured_ingest.v2.processes.connectors.sql.sql import (
15
+ _DATE_COLUMNS,
16
+ SQLAccessConfig,
17
+ SqlBatchFileData,
18
+ SQLConnectionConfig,
19
+ SQLDownloader,
20
+ SQLDownloaderConfig,
21
+ SQLIndexer,
22
+ SQLIndexerConfig,
23
+ SQLUploader,
24
+ SQLUploaderConfig,
25
+ SQLUploadStager,
26
+ SQLUploadStagerConfig,
27
+ parse_date_string,
28
+ )
29
+
30
+ if TYPE_CHECKING:
31
+ from sqlite3 import Connection as SqliteConnection
32
+ from sqlite3 import Cursor as SqliteCursor
33
+
34
+ CONNECTOR_TYPE = "sqlite"
35
+
36
+
37
class SQLiteAccessConfig(SQLAccessConfig):
    # SQLite access config; declares no fields of its own.
    pass
39
+
40
+
41
class SQLiteConnectionConfig(SQLConnectionConfig):
    # Connection settings for a SQLite database file, plus the context managers
    # that open connections and cursors on it.
    access_config: Secret[SQLiteAccessConfig] = Field(
        default=SQLiteAccessConfig(), validate_default=True
    )
    database_path: Path = Field(
        description="Path to the .db file.",
    )

    @model_validator(mode="after")
    def check_database_path(self) -> "SQLiteConnectionConfig":
        """Reject a ``database_path`` that does not exist or is not a regular file."""
        if not self.database_path.exists():
            raise ValueError(f"{self.database_path} does not exist")
        if not self.database_path.is_file():
            raise ValueError(f"{self.database_path} is not a valid file")
        return self

    @contextmanager
    def get_connection(self) -> Generator["SqliteConnection", None, None]:
        """Yield a connection; commit if the block succeeds, and always close it.

        If the block raises, nothing is committed, so a failed batch is not
        left partially written. The connection is closed even when the commit
        itself fails.
        """
        from sqlite3 import connect

        connection = connect(database=self.database_path)
        try:
            yield connection
            # Reached only when the with-block finished without raising.
            connection.commit()
        finally:
            connection.close()

    @contextmanager
    def get_cursor(self) -> Generator["SqliteCursor", None, None]:
        """Yield a cursor on a fresh connection, closing the cursor on exit."""
        with self.get_connection() as connection:
            cursor = connection.cursor()
            try:
                yield cursor
            finally:
                cursor.close()
76
+
77
+
78
class SQLiteIndexerConfig(SQLIndexerConfig):
    # SQLite indexer config; inherits every option from SQLIndexerConfig.
    pass
80
+
81
+
82
@dataclass
class SQLiteIndexer(SQLIndexer):
    """SQL indexer tagged with the SQLite connector type; indexing logic is inherited."""

    connection_config: SQLConnectionConfig
    index_config: SQLIndexerConfig
    connector_type: str = CONNECTOR_TYPE
87
+
88
+
89
class SQLiteDownloaderConfig(SQLDownloaderConfig):
    # SQLite downloader config; inherits every option from SQLDownloaderConfig.
    pass
91
+
92
+
93
@dataclass
class SQLiteDownloader(SQLDownloader):
    """SQL downloader that fetches a batch of rows from a SQLite table."""

    connection_config: SQLConnectionConfig
    download_config: SQLDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
    values_delimiter: str = "?"

    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Select the batch's rows by id; return ``(rows, column_names)``."""
        table_metadata = file_data.additional_metadata
        table_name = table_metadata.table_name
        id_column = table_metadata.id_column
        ids = [item.identifier for item in file_data.batch_items]
        with self.connection_config.get_connection() as sqlite_connection:
            cursor = sqlite_connection.cursor()
            selected = self.download_config.fields
            fields = ",".join(selected) if selected else "*"
            # One bound-parameter placeholder per id in the IN (...) list.
            values = ",".join([self.values_delimiter] * len(ids))
            query = f"SELECT {fields} FROM {table_name} WHERE {id_column} IN ({values})"
            logger.debug(f"running query: {query}\nwith values: {ids}")
            cursor.execute(query, ids)
            rows = cursor.fetchall()
            columns = [description[0] for description in cursor.description]
            return rows, columns
114
+
115
+
116
class SQLiteUploadStagerConfig(SQLUploadStagerConfig):
    # SQLite stager config; declares no options of its own.
    pass
118
+
119
+
120
class SQLiteUploadStager(SQLUploadStager):
    # Re-annotates upload_stager_config with the SQLite config type; the staging
    # logic itself is inherited from SQLUploadStager.
    upload_stager_config: SQLiteUploadStagerConfig
122
+
123
+
124
class SQLiteUploaderConfig(SQLUploaderConfig):
    # SQLite uploader config; inherits every option from SQLUploaderConfig.
    pass
126
+
127
+
128
@dataclass
class SQLiteUploader(SQLUploader):
    """SQL uploader for SQLite; additionally serialises list and dict values to JSON."""

    upload_config: SQLiteUploaderConfig = field(default_factory=SQLiteUploaderConfig)
    connection_config: SQLiteConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        """Convert rows to tuples the SQLite driver can bind.

        List and dict values are JSON-encoded first; non-null values in date
        columns are then parsed with ``parse_date_string``.
        """

        def convert(column_name: str, value: Any) -> Any:
            if isinstance(value, (list, dict)):
                value = json.dumps(value)
            if column_name in _DATE_COLUMNS and value is not None:
                return parse_date_string(value)
            return value

        return [
            tuple(convert(column_name, value) for column_name, value in zip(columns, row))
            for row in data
        ]
152
+
153
+
154
# Registry entry bundling the classes that make up the SQLite source connector.
sqlite_source_entry = SourceRegistryEntry(
    connection_config=SQLiteConnectionConfig,
    indexer_config=SQLiteIndexerConfig,
    indexer=SQLiteIndexer,
    downloader_config=SQLiteDownloaderConfig,
    downloader=SQLiteDownloader,
)

# Registry entry bundling the classes that make up the SQLite destination connector.
sqlite_destination_entry = DestinationRegistryEntry(
    connection_config=SQLiteConnectionConfig,
    uploader=SQLiteUploader,
    uploader_config=SQLiteUploaderConfig,
    upload_stager=SQLiteUploadStager,
    upload_stager_config=SQLiteUploadStagerConfig,
)
@@ -0,0 +1,29 @@
1
+ import json
2
+ from datetime import datetime
3
+ from typing import Any, Union
4
+
5
+ from dateutil import parser
6
+ from pydantic import ValidationError
7
+
8
+
9
def parse_datetime(date_value: Union[int, str, float, datetime]) -> datetime:
    """Coerce a datetime, a numeric timestamp, or a string into a ``datetime``.

    ``datetime`` values are returned unchanged, floats are epoch seconds and
    ints are epoch milliseconds. Anything else is tried as epoch seconds via
    ``float()``; on ``ValueError`` it is parsed with ``dateutil``'s ``parser.parse``.
    """
    if isinstance(date_value, datetime):
        return date_value
    if isinstance(date_value, float):
        return datetime.fromtimestamp(date_value)
    if isinstance(date_value, int):
        return datetime.fromtimestamp(date_value / 1000)

    try:
        # The conversion stays inside the try so a ValueError from either the
        # float() call or fromtimestamp() falls through to the parser.
        return datetime.fromtimestamp(float(date_value))
    except ValueError:
        return parser.parse(date_value)
22
+
23
+
24
def conform_string_to_dict(value: Any) -> dict:
    """Return ``value`` as a dict, decoding it from JSON when it is a string.

    Args:
        value: a dict, which is returned unchanged, or a JSON string.

    Returns:
        The dict itself, or the result of ``json.loads`` on the string.

    Raises:
        ValueError: if ``value`` is neither a dict nor a string, or the string
            is not valid JSON (``json.JSONDecodeError`` subclasses ``ValueError``).
    """
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        return json.loads(value)
    # pydantic's ValidationError cannot be built from a plain message, so
    # raising it here failed with an unrelated TypeError and hid this message.
    # A ValueError carries the message to the caller.
    raise ValueError(f"Input could not be mapped to a valid dict: {value}")