unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Version 0.7.1 of unstructured-ingest might be problematic.

Files changed (568):
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +55 -27
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +8 -8
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/METADATA +98 -97
  279. unstructured_ingest-0.7.1.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/entry_points.txt +0 -0
@@ -1,320 +0,0 @@
1
- import os
2
- import typing as t
3
- import urllib.request
4
- from dataclasses import dataclass
5
- from ftplib import FTP, error_perm
6
- from pathlib import Path
7
-
8
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
9
- from unstructured_ingest.interfaces import (
10
- BaseConnectorConfig,
11
- BaseSingleIngestDoc,
12
- BaseSourceConnector,
13
- IngestDocCleanupMixin,
14
- SourceConnectorCleanupMixin,
15
- )
16
- from unstructured_ingest.logger import logger
17
- from unstructured_ingest.utils.data_prep import (
18
- validate_date_args,
19
- )
20
- from unstructured_ingest.utils.dep_check import requires_dependencies
21
-
22
- if t.TYPE_CHECKING:
23
- from requests import Response, Session
24
-
25
# Host name of the FTP server this module connects to.
DOMAIN = "ftp.ncbi.nlm.nih.gov"
# FTP URL prefix derived from the host name.
FTP_DOMAIN = "ftp://" + DOMAIN
# Directory that is joined in front of every path on the FTP server.
PMC_DIR = "pub/pmc"
# Directory name used to validate configured paths and to split listed URLs.
PDF_DIR = "oa_pdf"
29
-
30
-
31
@dataclass
class BiomedFileMeta:
    """The three string locations tracked for one remote document."""

    # Remote location of the document.
    ftp_path: str
    # Local path the document is downloaded to.
    download_filepath: str
    # Local output path, stored without a file extension.
    output_filepath: str
36
-
37
-
38
@dataclass
class SimpleBiomedConfig(BaseConnectorConfig):
    """Connector config where path is the FTP directory path and
    api_id, api_from, api_until are OA Web Service API parameters.

    ``__post_init__`` selects the mode: without ``path`` the API arguments
    are validated and ``is_api`` is set; with ``path`` the FTP server is
    contacted and either ``is_file`` or ``is_dir`` is set.
    """

    path: t.Optional[str] = None
    # OA Web Service API Options
    api_id: t.Optional[str] = None
    api_from: t.Optional[str] = None
    api_until: t.Optional[str] = None
    # Timeout value used for HTTP requests made with this config.
    max_request_time: int = 45

    def validate_api_inputs(self):
        """Return True when the OA Web Service arguments are usable.

        Every date that is supplied (``api_from``, ``api_until``) is passed
        through ``validate_date_args`` and all of them must validate. When no
        date is supplied, a non-empty ``api_id`` is sufficient on its own.
        """
        # Validate each supplied date independently so that a later check
        # cannot overwrite the outcome of an earlier one.
        date_checks = [
            validate_date_args(value) for value in (self.api_from, self.api_until) if value
        ]
        if date_checks:
            return all(date_checks)
        return bool(self.api_id)

    def __post_init__(self):
        """Classify the configuration as API, single file or directory.

        Raises:
            ValueError: if neither a path nor a usable API argument is given,
                if the path does not start with ``PDF_DIR``, or if the FTP
                server rejects the path.
        """
        self.is_file = False
        self.is_dir = False
        self.is_api = False

        if not self.path:
            if not self.validate_api_inputs():
                raise ValueError(
                    "Path argument or at least one of the "
                    "OA Web Service arguments MUST be provided.",
                )
            self.is_api = True
            return

        self.path = self.path.strip("/")
        if not self.path.lower().startswith(PDF_DIR):
            raise ValueError(f"Path MUST start with {PDF_DIR}")

        path = Path(PMC_DIR) / self.path
        # The context manager closes the connection on every exit path.
        with FTP(DOMAIN) as ftp:
            ftp.login()
            try:
                if path.suffix == ".pdf":
                    # A file cannot be entered; probe its parent directory.
                    ftp.cwd(str(path.parent))
                    self.is_file = True
                else:
                    # cwd() raises on failure, so returning means the
                    # directory exists.
                    ftp.cwd(str(path))
                    self.is_dir = True
            except error_perm as exc:
                reason = str(exc).lower()
                if "no such file or directory" in reason:
                    raise ValueError(f"The path: {path} is not valid.") from exc
                if "not a directory" in reason:
                    self.is_file = True
                else:
                    raise ValueError(
                        f"Something went wrong when validating the path: {path}.",
                    ) from exc
104
-
105
-
106
@dataclass
class BiomedIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Single document described by a ``BiomedFileMeta`` record."""

    connector_config: SimpleBiomedConfig
    file_meta: BiomedFileMeta
    registry_name: str = "biomed"

    @property
    def filename(self):
        """Resolved local path of the downloaded file."""
        local_copy = Path(self.file_meta.download_filepath)  # type: ignore
        return local_copy.resolve()

    @property
    def _output_filename(self):
        """Resolved output path with a ``.json`` suffix appended."""
        target = f"{self.file_meta.output_filepath}.json"
        return Path(target).resolve()

    def cleanup_file(self):
        """Delete the downloaded file unless the read config asks to keep it."""
        if self.read_config.preserve_downloads:
            return
        if not self.filename.is_file():
            return
        if self.read_config.download_only:
            return
        logger.debug(f"cleaning up {self}")
        self.filename.unlink()

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Create the download directory if needed, then fetch the file."""
        target = self.file_meta.download_filepath  # type: ignore
        parent_dir = Path(os.path.dirname(target))  # type: ignore
        if not parent_dir.is_dir():
            logger.debug(f"creating directory: {parent_dir}")
            parent_dir.mkdir(parents=True, exist_ok=True)
        self._retrieve()
        logger.debug(f"file downloaded: {self.file_meta.download_filepath}")

    @SourceConnectionNetworkError.wrap
    def _retrieve(self):
        """Copy the remote file to the local download path."""
        remote = self.file_meta.ftp_path  # type: ignore
        local = self.file_meta.download_filepath
        urllib.request.urlretrieve(remote, local)
148
-
149
-
150
class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Objects of this class support fetching documents from Biomedical literature FTP directory"""

    connector_config: SimpleBiomedConfig

    def get_base_endpoints_url(self) -> str:
        """Build the OA Web Service URL from the configured API arguments."""
        endpoint_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?format=pdf"

        if self.connector_config.api_id:
            endpoint_url += f"&id={self.connector_config.api_id}"

        if self.connector_config.api_from:
            endpoint_url += f"&from={self.connector_config.api_from}"

        if self.connector_config.api_until:
            endpoint_url += f"&until={self.connector_config.api_until}"

        return endpoint_url

    def _build_file_meta(self, ftp_path: str, local_path: str) -> BiomedFileMeta:
        """Create the metadata record for one remote file.

        ``local_path`` is joined onto the configured download and output
        directories; both results are resolved and stored as POSIX strings.
        """
        download_path = Path(self.read_config.download_dir) / local_path
        output_path = Path(self.processor_config.output_dir) / local_path
        return BiomedFileMeta(
            ftp_path=ftp_path,
            download_filepath=download_path.resolve().as_posix(),
            output_filepath=output_path.resolve().as_posix(),
        )

    @requires_dependencies(["requests"], extras="biomed")
    def _list_objects_api(self) -> t.List[BiomedFileMeta]:
        """List files through the OA Web Service, following resumption links."""
        from bs4 import BeautifulSoup
        from requests import Session
        from requests.adapters import HTTPAdapter

        files: t.List[BiomedFileMeta] = []
        endpoint_url = self.get_base_endpoints_url()

        # One session serves every page and is closed when listing finishes.
        with Session() as session:
            adapter = HTTPAdapter()
            session.mount("http://", adapter)
            session.mount("https://", adapter)

            while endpoint_url:
                response = self._get_request(session=session, endpoint_url=endpoint_url)
                soup = BeautifulSoup(response.content, features="lxml")
                urls = [link["href"] for link in soup.find_all("link")]

                if not urls:
                    break

                # A trailing link containing "resumptiontoken" is the next
                # page to request, not a document.
                endpoint_url = urls[-1] if "resumptiontoken" in urls[-1].lower() else None
                if endpoint_url:
                    urls = urls[:-1]

                for url in urls:
                    parts = url.split(PDF_DIR)
                    if len(parts) > 1:
                        files.append(self._build_file_meta(url, parts[1].strip("/")))

        return files

    @SourceConnectionNetworkError.wrap
    def _get_request(self, session: "Session", endpoint_url: str) -> "Response":
        """GET ``endpoint_url`` using the configured request timeout."""
        return session.get(endpoint_url, timeout=self.connector_config.max_request_time)

    def _list_objects(self) -> t.List[BiomedFileMeta]:
        """List files by walking the configured path on the FTP server.

        Raises:
            ValueError: if a directory cannot be entered.
        """
        files: t.List[BiomedFileMeta] = []

        # Conform to mypy, null check performed elsewhere.
        # Wouldn't be in this method unless self.config.path exists
        path: str = self.connector_config.path if self.connector_config.path else ""

        def list_directory(directory: Path) -> t.List[Path]:
            """Return the entries of ``directory`` as paths below it."""
            full_path = Path(PMC_DIR) / directory
            logger.debug(f"traversing directory: {full_path}")

            # Close the connection before recursing so that open connections
            # do not pile up with directory depth.
            with FTP(DOMAIN) as ftp:
                ftp.login()
                try:
                    response = ftp.cwd(full_path.as_posix())
                except error_perm as exc:
                    raise ValueError(f"{full_path} is not a valid directory.") from exc

                if "command successful" not in response.lower():
                    raise ValueError(f"{full_path} is not a valid directory.")

                return [directory / name for name in ftp.nlst()]

        def traverse(directory: Path) -> None:
            sub_paths = list_directory(directory)
            if not sub_paths:
                return

            # The first entry decides how the whole listing is treated:
            # with a suffix, entries are files; otherwise sub-directories.
            if sub_paths[0].suffix:
                for sub_path in sub_paths:
                    posix_path = sub_path.as_posix()
                    ftp_path = f"{FTP_DOMAIN}/{PMC_DIR}/{posix_path}"
                    # Drop the leading path component for the local layout.
                    local_path = "/".join(posix_path.split("/")[1:])
                    files.append(self._build_file_meta(ftp_path, local_path))
            else:
                for sub_path in sub_paths:
                    traverse(sub_path)

        if self.connector_config.is_file:
            ftp_path = f"{FTP_DOMAIN}/{PMC_DIR}/{self.connector_config.path}"
            local_path = "/".join(path.split("/")[1:])
            return [self._build_file_meta(ftp_path, local_path)]

        traverse(Path(path))
        return files

    def initialize(self):
        """No setup is required for this connector."""
        pass

    @requires_dependencies(["requests"], extras="biomed")
    def check_connection(self):
        """Send a HEAD request to the OA Web Service endpoint.

        Raises:
            SourceConnectionError: if the request fails or returns an HTTP
                error status.
        """
        import requests

        try:
            # The request itself sits inside the try so that network failures
            # are reported the same way as HTTP error statuses.
            resp = requests.head(
                self.get_base_endpoints_url(),
                timeout=self.connector_config.max_request_time,
            )
            resp.raise_for_status()
        except requests.RequestException as request_error:
            raise SourceConnectionError(
                f"failed to validate connection: {request_error}"
            ) from request_error

    def get_ingest_docs(self):
        """Return one ``BiomedIngestDoc`` per listed file."""
        files = self._list_objects_api() if self.connector_config.is_api else self._list_objects()
        return [
            BiomedIngestDoc(
                processor_config=self.processor_config,
                connector_config=self.connector_config,
                read_config=self.read_config,
                file_meta=file,
            )
            for file in files
        ]
@@ -1,158 +0,0 @@
1
- import copy
2
- import typing as t
3
- import uuid
4
- from dataclasses import dataclass
5
-
6
- from unstructured_ingest.enhanced_dataclass.core import _asdict
7
- from unstructured_ingest.error import DestinationConnectionError
8
- from unstructured_ingest.interfaces import (
9
- AccessConfig,
10
- BaseConnectorConfig,
11
- BaseDestinationConnector,
12
- WriteConfig,
13
- )
14
- from unstructured_ingest.logger import logger
15
- from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
16
- from unstructured_ingest.utils.dep_check import requires_dependencies
17
-
18
- if t.TYPE_CHECKING:
19
- from chromadb import Collection as ChromaCollection
20
-
21
-
22
@dataclass
class ChromaAccessConfig(AccessConfig):
    """Optional settings and headers handed to the Chroma client."""

    # Passed as the ``settings`` argument when a client is created.
    settings: t.Optional[t.Dict[str, str]] = None
    # Passed as the ``headers`` argument of the HTTP client.
    headers: t.Optional[t.Dict[str, str]] = None
26
-
27
-
28
@dataclass
class SimpleChromaConfig(BaseConnectorConfig):
    """Connection parameters for a Chroma collection.

    Either ``path`` or both ``host`` and ``port`` must be set for a client
    to be created.
    """

    access_config: ChromaAccessConfig
    # Name of the collection that is fetched or created.
    collection_name: str
    # When set, a persistent client is created with this path.
    path: t.Optional[str] = None
    tenant: t.Optional[str] = "default_tenant"
    database: t.Optional[str] = "default_database"
    # Host and port of the HTTP client; used only when ``path`` is unset.
    host: t.Optional[str] = None
    port: t.Optional[int] = None
    ssl: bool = False
38
-
39
-
40
@dataclass
class ChromaWriteConfig(WriteConfig):
    """Write options for the Chroma destination."""

    # Number of elements sent per upsert call.
    batch_size: int = 100
43
-
44
-
45
@dataclass
class ChromaDestinationConnector(BaseDestinationConnector):
    """Destination connector that upserts elements into a Chroma collection."""

    write_config: ChromaWriteConfig
    connector_config: SimpleChromaConfig
    # Cached collection handle, created on first access.
    _collection: t.Optional["ChromaCollection"] = None

    @property
    def chroma_collection(self):
        """Return the collection, creating it on first use."""
        if self._collection is None:
            self._collection = self.create_collection()
        return self._collection

    def initialize(self):
        """No eager setup; the collection is created lazily."""
        pass

    @DestinationConnectionError.wrap
    def check_connection(self):
        """Validate the connection by resolving the collection."""
        _ = self.chroma_collection

    def to_dict(self, **kwargs):
        """
        The _collection variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle 'module' object
        When serializing, remove it, meaning collection data will need to be reinitialized
        when deserialized
        """
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_collection"):
            setattr(self_cp, "_collection", None)
        return _asdict(self_cp, **kwargs)

    @requires_dependencies(["chromadb"], extras="chroma")
    def create_collection(self) -> "ChromaCollection":
        """Create a client from the connector config and fetch the collection.

        A persistent client is used when ``path`` is set, otherwise an HTTP
        client when both ``host`` and ``port`` are set.

        Raises:
            ValueError: if neither ``path`` nor ``host`` and ``port`` are set.
        """
        import chromadb

        config = self.connector_config
        if config.path:
            chroma_client = chromadb.PersistentClient(
                path=config.path,
                # ``settings`` is a field of the access config, not of the
                # connector config itself.
                settings=config.access_config.settings,
                tenant=config.tenant,
                database=config.database,
            )
        elif config.host and config.port:
            chroma_client = chromadb.HttpClient(
                host=config.host,
                port=config.port,
                ssl=config.ssl,
                headers=config.access_config.headers,
                settings=config.access_config.settings,
                tenant=config.tenant,
                database=config.database,
            )
        else:
            raise ValueError("Chroma connector requires either path or host and port to be set.")

        return chroma_client.get_or_create_collection(name=config.collection_name)

    @DestinationConnectionError.wrap
    @requires_dependencies(["chromadb"], extras="chroma")
    def upsert_batch(self, batch):
        """Upsert one prepared batch into the collection.

        Args:
            batch: dict with the parallel lists ``ids``, ``documents``,
                ``embeddings`` and ``metadatas``.

        Raises:
            ValueError: wrapping any exception raised by the upsert call.
        """
        collection = self.chroma_collection

        try:
            # Chroma wants lists even if there is only one element
            # Upserting to prevent duplicates
            collection.upsert(
                ids=batch["ids"],
                documents=batch["documents"],
                embeddings=batch["embeddings"],
                metadatas=batch["metadatas"],
            )
        except Exception as e:
            raise ValueError(f"chroma error: {e}") from e

    @staticmethod
    def prepare_chroma_list(chunk: t.Tuple[t.Dict[str, t.Any]]) -> t.Dict[str, t.List[t.Any]]:
        """Helper function to break a tuple of dicts into list of parallel lists for ChromaDb.
        ({'id':1}, {'id':2}, {'id':3}) -> {'ids':[1,2,3]}"""
        # Each list has one entry per element of ``chunk``, so the lists are
        # the same length by construction; missing keys become None.
        return {
            "ids": [x.get("id") for x in chunk],
            "documents": [x.get("document") for x in chunk],
            "embeddings": [x.get("embedding") for x in chunk],
            "metadatas": [x.get("metadata") for x in chunk],
        }

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upsert ``elements_dict`` in batches of ``write_config.batch_size``."""
        logger.info(f"inserting / updating {len(elements_dict)} documents to destination ")

        chroma_batch_size = self.write_config.batch_size

        for chunk in batch_generator(elements_dict, chroma_batch_size):
            self.upsert_batch(self.prepare_chroma_list(chunk))

    def normalize_dict(self, element_dict: dict) -> dict:
        """Convert an element dict into the id/embedding/document/metadata shape.

        ``embeddings`` and ``text`` are popped from ``element_dict`` (the
        argument is mutated); what remains is flattened into the metadata.
        A random UUID is used when ``element_id`` is absent.
        """
        element_id = element_dict.get("element_id", str(uuid.uuid4()))
        return {
            "id": element_id,
            "embedding": element_dict.pop("embeddings", None),
            "document": element_dict.pop("text", None),
            "metadata": flatten_dict(
                element_dict, separator="-", flatten_lists=True, remove_none=True
            ),
        }
@@ -1,122 +0,0 @@
1
- import typing as t
2
- import uuid
3
- from dataclasses import dataclass, field
4
-
5
- from unstructured_ingest.enhanced_dataclass import enhanced_field
6
- from unstructured_ingest.error import DestinationConnectionError
7
- from unstructured_ingest.interfaces import (
8
- AccessConfig,
9
- BaseConnectorConfig,
10
- BaseDestinationConnector,
11
- WriteConfig,
12
- )
13
- from unstructured_ingest.logger import logger
14
- from unstructured_ingest.utils.data_prep import flatten_dict
15
- from unstructured_ingest.utils.dep_check import requires_dependencies
16
-
17
- if t.TYPE_CHECKING:
18
- from clarifai.client.input import Inputs
19
-
20
-
21
@dataclass
class ClarifaiAccessConfig(AccessConfig):
    """Credentials for the Clarifai destination."""

    # Declared as a sensitive field; passed to the client as ``pat``.
    api_key: str = enhanced_field(sensitive=True)
24
-
25
-
26
@dataclass
class SimpleClarifaiConfig(BaseConnectorConfig):
    """Identifies the Clarifai app that receives the uploaded inputs."""

    access_config: ClarifaiAccessConfig
    # App and user ids handed to the client constructor.
    app_id: str
    user_id: str
    # Optional dataset id attached to every text input.
    dataset_id: t.Optional[str] = None
32
-
33
-
34
@dataclass
class ClarifaiWriteConfig(WriteConfig):
    """Write options for the Clarifai destination."""

    # Number of inputs uploaded per call.
    batch_size: int = 50
37
-
38
-
39
@dataclass
class ClarifaiDestinationConnector(BaseDestinationConnector):
    """Destination connector that uploads elements as Clarifai text inputs."""

    write_config: ClarifaiWriteConfig
    connector_config: SimpleClarifaiConfig
    # Cached client, created on first access of ``client``.
    _client: t.Optional["Inputs"] = field(init=False, default=None)

    @property
    @requires_dependencies(["clarifai"], extras="clarifai")
    def client(self) -> "Inputs":
        """Return the Clarifai ``Inputs`` client, creating it on first use.

        Raises:
            ValueError: if the access config carries no API key.
        """
        if self._client is None:
            from clarifai.client.input import Inputs

            clarifai_pat = self.connector_config.access_config.api_key
            if clarifai_pat is None:
                # Fail with a real exception instead of leaving the key
                # undefined for the constructor call below.
                raise ValueError("please provide clarifai PAT key")

            self._client = Inputs(
                app_id=self.connector_config.app_id,
                user_id=self.connector_config.user_id,
                pat=clarifai_pat,
            )
        return self._client

    @requires_dependencies(["clarifai"], extras="clarifai")
    @DestinationConnectionError.wrap
    def initialize(self):
        """Create the client eagerly."""
        _ = self.client

    def check_connection(self):
        """Validate the connection by listing a single input.

        Raises:
            DestinationConnectionError: if the listing call fails.
        """
        try:
            # Consume the result so that a lazily evaluated listing runs.
            _ = list(self.client.list_inputs(page_no=1, per_page=1))
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def normalize_dict(self, element_dict: dict) -> dict:
        """Modifying schema of the dict in order to compile with clarifai input formats

        ``text`` is popped from ``element_dict`` (the argument is mutated);
        what remains is flattened into the metadata. Every call generates a
        fresh random ``input_id``.
        """
        return {
            "input_id": str(uuid.uuid4().hex),
            "text": element_dict.pop("text", None),
            "metadata": {
                **flatten_dict(
                    element_dict,
                    separator="_",
                    flatten_lists=True,
                    remove_none=True,
                ),
            },
        }

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upload ``elements_dict`` in batches of ``write_config.batch_size``.

        Each element must provide ``input_id``, ``text`` and ``metadata``.
        Exceptions raised by the client propagate unchanged.
        """
        from google.protobuf.struct_pb2 import Struct

        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"app {self.connector_config.app_id} "
        )
        # Use the property so the client is created if initialize() has not
        # been called yet.
        client = self.client
        batch_size = self.write_config.batch_size
        for idx in range(0, len(elements_dict), batch_size):
            input_batch = []
            for elem in elements_dict[idx : idx + batch_size]:
                meta_struct = Struct()
                meta_struct.update(elem["metadata"])
                input_batch.append(
                    client.get_text_input(
                        input_id=elem["input_id"],
                        raw_text=elem["text"],
                        dataset_id=self.connector_config.dataset_id,
                        metadata=meta_struct,
                    )
                )
            result_id = client.upload_inputs(inputs=input_batch)
            logger.debug(
                f"Input posted successfully into {self.connector_config.app_id}. "
                f"Result id: {result_id}"
            )