unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +55 -27
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +8 -8
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/METADATA +98 -97
  279. unstructured_ingest-0.7.1.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/entry_points.txt +0 -0
@@ -1,109 +0,0 @@
1
- """
2
- Box Connector
3
- Box does not make it simple to download files with an App.
4
- First of all, this does not work with a free Box account.
5
- Make sure the App service email is a collaborator for your folder (co-owner or editor)
6
- Make sure you have the 'write all files' application scope
7
- Maybe check 'Make API calls using the as-user header'
8
- REAUTHORIZE app after making any of the above changes
9
- """
10
-
11
- import typing as t
12
- from dataclasses import dataclass
13
-
14
- from unstructured_ingest.connector.fsspec.fsspec import (
15
- FsspecDestinationConnector,
16
- FsspecIngestDoc,
17
- FsspecSourceConnector,
18
- FsspecWriteConfig,
19
- SimpleFsspecConfig,
20
- )
21
- from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
22
- from unstructured_ingest.interfaces import AccessConfig
23
- from unstructured_ingest.logger import logger
24
- from unstructured_ingest.utils.dep_check import requires_dependencies
25
-
26
-
27
class AccessTokenError(Exception):
    """Signal that there is a problem with the access token."""
29
-
30
-
31
@dataclass
class BoxWriteConfig(FsspecWriteConfig):
    """Write configuration for Box; declares nothing beyond ``FsspecWriteConfig``."""
34
-
35
-
36
@dataclass
class BoxAccessConfig(AccessConfig):
    """Access configuration for the Box connector."""

    # Value handed to ``JWTAuth.from_settings_file`` when the access kwargs are built.
    box_app_config: t.Optional[str] = None
39
-
40
-
41
@dataclass
class SimpleBoxConfig(SimpleFsspecConfig):
    """Fsspec connector configuration specialised for Box."""

    access_config: BoxAccessConfig = None

    @requires_dependencies(["boxfs"], extras="box")
    def get_access_config(self) -> dict:
        """Build the keyword arguments used to construct the Box filesystem.

        The oauth object is created here, on demand, because it is not
        serializable and therefore cannot be stored directly in the config.

        Returns:
            A dict holding an ``"oauth"`` entry followed by every access-config
            value except ``box_app_config``.
        """
        from boxsdk import JWTAuth

        oauth = JWTAuth.from_settings_file(self.access_config.box_app_config)

        remaining: dict[str, t.Any] = self.access_config.to_dict()
        # The settings-file path has been consumed above; it is not a filesystem kwarg.
        remaining.pop("box_app_config", None)

        # Later keys win, matching a dict.update() of the oauth-only dict.
        return {"oauth": oauth, **remaining}
61
-
62
-
63
@dataclass
class BoxIngestDoc(FsspecIngestDoc):
    """Ingest doc for a single file fetched through the Box connector."""

    connector_config: SimpleBoxConfig
    # Name under which this doc type is registered.
    registry_name: str = "box"

    @SourceConnectionError.wrap
    @requires_dependencies(["boxfs", "fsspec"], extras="box")
    def get_file(self):
        """Delegate to the parent download after checking the Box dependencies."""
        super(BoxIngestDoc, self).get_file()
72
-
73
-
74
@dataclass
class BoxSourceConnector(FsspecSourceConnector):
    """Source connector that reads documents through the Box filesystem."""

    connector_config: SimpleBoxConfig

    @requires_dependencies(["boxfs"], extras="box")
    def check_connection(self):
        """Validate the connection by constructing a ``BoxFileSystem``.

        Raises:
            SourceConnectionError: if the filesystem cannot be constructed. The
                original exception is attached as ``__cause__``.
        """
        from boxfs import BoxFileSystem

        try:
            BoxFileSystem(**self.connector_config.get_access_config())
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain explicitly so the root cause survives in the traceback.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def __post_init__(self):
        """Select the ingest-doc class instantiated for each remote file."""
        self.ingest_doc_cls: t.Type[BoxIngestDoc] = BoxIngestDoc
90
-
91
-
92
@dataclass
class BoxDestinationConnector(FsspecDestinationConnector):
    """Destination connector that writes through the Box filesystem."""

    connector_config: SimpleBoxConfig
    write_config: BoxWriteConfig

    @requires_dependencies(["boxfs", "fsspec"], extras="box")
    def initialize(self):
        """Delegate to the parent initialisation after checking the Box dependencies."""
        super().initialize()

    @requires_dependencies(["boxfs"], extras="box")
    def check_connection(self):
        """Validate the connection by constructing a ``BoxFileSystem``.

        Raises:
            DestinationConnectionError: if the filesystem cannot be constructed.
                The original exception is attached as ``__cause__``.
        """
        from boxfs import BoxFileSystem

        try:
            BoxFileSystem(**self.connector_config.get_access_config())
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain explicitly so the root cause survives in the traceback.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e
@@ -1,160 +0,0 @@
1
- """
2
- Dropbox Connector
3
- The Dropbox Connector presents a couple abnormal situations.
4
- 1) They don't have an unexpiring token
5
- 2) They require a forward slash `/` in front of the remote_file_path. This presents
6
- some real problems creating paths. When appending a path that begins with a
7
- forward slash to any path, whether using the / shorthand or joinpath, causes the
8
- starting path to disappear. So the `/` needs to be stripped off.
9
- 3) To list and get files from the Dropbox root directory you need "", " ", or " /"
10
- """
11
-
12
- import re
13
- from dataclasses import dataclass
14
- from pathlib import Path
15
- from typing import Type
16
-
17
- from unstructured_ingest.connector.fsspec.fsspec import (
18
- FsspecDestinationConnector,
19
- FsspecIngestDoc,
20
- FsspecSourceConnector,
21
- FsspecWriteConfig,
22
- SimpleFsspecConfig,
23
- )
24
- from unstructured_ingest.enhanced_dataclass import enhanced_field
25
- from unstructured_ingest.error import SourceConnectionError
26
- from unstructured_ingest.interfaces import AccessConfig
27
- from unstructured_ingest.logger import logger
28
- from unstructured_ingest.utils.dep_check import requires_dependencies
29
-
30
-
31
class MissingFolderError(Exception):
    """Signal that no folder exists by the given name (for root try `dropbox:// /`)."""
33
-
34
-
35
@dataclass
class DropboxAccessConfig(AccessConfig):
    """Access configuration for the Dropbox connector."""

    # Declared with sensitive=True on the enhanced field.
    token: str = enhanced_field(sensitive=True)
38
-
39
-
40
@dataclass
class DropboxWriteConfig(FsspecWriteConfig):
    """Write configuration for Dropbox; declares nothing beyond ``FsspecWriteConfig``."""
43
-
44
-
45
@dataclass
class SimpleDropboxConfig(SimpleFsspecConfig):
    """Fsspec connector configuration specialised for Dropbox."""

    # Defaults to None; typed as the Dropbox-specific access config.
    access_config: DropboxAccessConfig = None
48
-
49
-
50
@dataclass
class DropboxIngestDoc(FsspecIngestDoc):
    """Ingest doc for a single file fetched through the Dropbox connector.

    Dropbox requires a forward slash at the front of the folder path, which
    complicates path joining, so local paths are derived by custom logic here.
    Dropbox uses an empty string `""`, a space `" "` or `" /"` to list root.
    """

    connector_config: SimpleDropboxConfig
    # Name under which this doc type is registered.
    registry_name: str = "dropbox"

    @SourceConnectionError.wrap
    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def get_file(self):
        """Delegate to the parent download after checking the Dropbox dependencies."""
        super().get_file()

    def _relative_remote_path(self) -> str:
        """Return ``remote_file_path`` relative to the configured Dropbox folder.

        For the root folder (``dir_path == " "``) only the leading slash is
        dropped. Otherwise the ``/<dir_path>/`` segment is removed once, so a
        nested folder that repeats the root folder's name is left intact.
        """
        dir_path = self.connector_config.dir_path
        if dir_path == " ":
            return re.sub("^/", "", self.remote_file_path)
        # count=1: strip only the first occurrence instead of every occurrence.
        return self.remote_file_path.replace(f"/{dir_path}/", "", 1)

    @property
    def _output_filename(self):
        """Path of the ``.json`` output file under the processor output directory."""
        return Path(self.processor_config.output_dir) / f"{self._relative_remote_path()}.json"

    def _tmp_download_file(self):
        """Local path the remote file is downloaded to.

        Returns:
            An empty string when no download directory is configured, otherwise
            a ``Path`` under the download directory.
        """
        download_dir: str = self.read_config.download_dir if self.read_config.download_dir else ""
        if not download_dir:
            return ""
        return Path(download_dir) / self._relative_remote_path()
95
-
96
-
97
@dataclass
class DropboxSourceConnector(FsspecSourceConnector):
    """Source connector that reads documents through the Dropbox filesystem.

    Dropbox requires a forward slash at the front of the folder path, so every
    listing call below prefixes ``path_without_protocol`` with ``/``.
    """

    connector_config: SimpleDropboxConfig

    def __post_init__(self):
        """Select the ingest-doc class instantiated for each remote file."""
        self.ingest_doc_cls: Type[DropboxIngestDoc] = DropboxIngestDoc

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def initialize(self):
        """Create the filesystem and verify the configured folder can be listed.

        Raises:
            SourceConnectionError: if creating the filesystem or listing the
                folder raises. The original exception is attached as ``__cause__``.
            MissingFolderError: if the listing comes back empty.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        try:
            self.fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
                **self.connector_config.get_access_config(),
            )
            ls_output = self.fs.ls(f"/{self.connector_config.path_without_protocol}")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

        if ls_output:
            return
        # The former intermediate "No objects found" branch required a listing that was
        # truthy yet had fewer than one element, so it could never run; it is dropped
        # without changing which exception callers observe.
        raise MissingFolderError(
            "There is no folder by that name. For root try `dropbox:// /`",
        )

    def _list_files(self):
        """Return the names of remote entries that report a truthy ``size``.

        Entries whose size is zero or missing are skipped, because directories
        listed in cloud storage can show up as 0-byte files.
        """
        remote_root = f"/{self.connector_config.path_without_protocol}"
        if not self.connector_config.recursive:
            # fs.ls does not walk directories.
            return [
                entry.get("name")
                for entry in self.fs.ls(remote_root, detail=True)
                if entry.get("size")
            ]
        # fs.find will recursively walk directories;
        # "size" is a common key for all the cloud protocols with fs.
        return [
            name
            for name, info in self.fs.find(remote_root, detail=True).items()
            if info.get("size")
        ]
155
-
156
-
157
@dataclass
class DropboxDestinationConnector(FsspecDestinationConnector):
    """Destination connector for Dropbox; all behaviour comes from the fsspec base class."""

    # NOTE(review): typed as the generic SimpleFsspecConfig rather than
    # SimpleDropboxConfig - confirm whether that is intentional.
    connector_config: SimpleFsspecConfig
    write_config: DropboxWriteConfig
@@ -1,359 +0,0 @@
1
- import fnmatch
2
- import json
3
- import os
4
- import typing as t
5
- from abc import ABC
6
- from contextlib import suppress
7
- from dataclasses import dataclass
8
- from pathlib import Path, PurePath
9
-
10
- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
11
- from unstructured_ingest.error import (
12
- DestinationConnectionError,
13
- SourceConnectionError,
14
- SourceConnectionNetworkError,
15
- )
16
- from unstructured_ingest.interfaces import (
17
- BaseConnectorConfig,
18
- BaseDestinationConnector,
19
- BaseSingleIngestDoc,
20
- BaseSourceConnector,
21
- FsspecConfig,
22
- IngestDocCleanupMixin,
23
- SourceConnectorCleanupMixin,
24
- SourceMetadata,
25
- WriteConfig,
26
- )
27
- from unstructured_ingest.logger import logger
28
- from unstructured_ingest.utils.compression import (
29
- TAR_FILE_EXT,
30
- ZIP_FILE_EXT,
31
- CompressionSourceConnectorMixin,
32
- )
33
- from unstructured_ingest.utils.dep_check import (
34
- requires_dependencies,
35
- )
36
-
37
# Protocol names of the remote filesystems handled by the fsspec connectors.
SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = ["s3", "s3a", "abfs", "az", "gs", "gcs", "box", "dropbox", "sftp"]
48
-
49
-
50
@dataclass
class SimpleFsspecConfig(FsspecConfig, BaseConnectorConfig):
    """Connector configuration combining ``FsspecConfig`` and ``BaseConnectorConfig``."""
53
-
54
-
55
@dataclass
class FsspecIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating fetching a doc and writing processed results (but not
    doing the processing!).

    Also includes a cleanup method. When things go wrong and the cleanup
    method is not called, the file is left behind on the filesystem to assist debugging.
    """

    connector_config: SimpleFsspecConfig
    remote_file_path: str

    def _tmp_download_file(self):
        """Return the local path the remote file is downloaded to.

        The ``<dir_path>/`` segment is removed from the remote path once, so a
        nested directory that repeats that name is preserved.
        """
        download_dir = self.read_config.download_dir if self.read_config.download_dir else ""
        return Path(download_dir) / self.remote_file_path.replace(
            f"{self.connector_config.dir_path}/",
            "",
            1,
        )

    @property
    def _output_filename(self):
        """Return the path of the ``.json`` output file for this document.

        The name is parsed dynamically because the remote path may point at a
        single file, a directory, or a nested directory.
        """
        if self.remote_file_path == self.connector_config.path_without_protocol:
            # The remote URL points at one file: use just its base name.
            file = self.remote_file_path.split("/")[-1]
            filename = f"{file}.json"
        else:
            path_without_protocol = (
                self.connector_config.path_without_protocol
                if self.connector_config.path_without_protocol.endswith("/")
                else f"{self.connector_config.path_without_protocol}/"
            )
            # count=1: strip a single occurrence so repeated segments deeper in the
            # path are not removed as well.
            filename = f"{self.remote_file_path.replace(path_without_protocol, '', 1)}.json"
        return Path(self.processor_config.output_dir) / filename

    def _create_full_tmp_dir_path(self):
        """Includes "directories" in the object path"""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetches the file from the current filesystem and stores it locally."""
        from fsspec import AbstractFileSystem, get_filesystem_class

        self._create_full_tmp_dir_path()
        fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )
        # _get_file performs the download; the identical fs.get call that used to
        # follow it transferred the same file a second time and has been removed.
        self._get_file(fs=fs)
        self.update_source_metadata()

    @SourceConnectionNetworkError.wrap
    def _get_file(self, fs):
        """Download ``remote_file_path`` to the temporary download location using ``fs``."""
        fs.get(rpath=self.remote_file_path, lpath=self._tmp_download_file().as_posix())

    @requires_dependencies(["fsspec"])
    def update_source_metadata(self):
        """Populate ``source_metadata`` from the remote filesystem's view of the file."""
        from fsspec import AbstractFileSystem, get_filesystem_class

        fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )

        # A filesystem that raises NotImplementedError for either timestamp leaves it None.
        date_created = None
        with suppress(NotImplementedError):
            date_created = fs.created(self.remote_file_path).isoformat()

        date_modified = None
        with suppress(NotImplementedError):
            date_modified = fs.modified(self.remote_file_path).isoformat()

        # For the "gs" protocol the etag from fs.info is used instead of fs.checksum.
        version = (
            fs.checksum(self.remote_file_path)
            if self.connector_config.protocol != "gs"
            else fs.info(self.remote_file_path).get("etag", "")
        )
        file_exists = fs.exists(self.remote_file_path)
        self.source_metadata = SourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            version=str(version),
            source_url=f"{self.connector_config.protocol}://{self.remote_file_path}",
            exists=file_exists,
        )

    @property
    def filename(self):
        """The filename of the file after downloading from cloud"""
        return self._tmp_download_file()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Returns the equivalent of ls in dict"""
        return {
            "protocol": self.connector_config.protocol,
            "remote_file_path": self.remote_file_path,
        }
154
-
155
-
156
@dataclass
class FsspecSourceConnector(
    SourceConnectorCleanupMixin,
    CompressionSourceConnectorMixin,
    BaseSourceConnector,
):
    """Objects of this class support fetching document(s) from an fsspec filesystem."""

    connector_config: SimpleFsspecConfig

    def check_connection(self):
        """Validate the connection by listing the configured remote path.

        Raises:
            SourceConnectionError: if creating the filesystem or listing the
                path raises. The original exception is attached as ``__cause__``.
        """
        from fsspec import get_filesystem_class

        try:
            fs = get_filesystem_class(self.connector_config.protocol)(
                **self.connector_config.get_access_config(),
            )
            fs.ls(path=self.connector_config.path_without_protocol, detail=False)
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def __post_init__(self):
        """Select the ingest-doc class instantiated for each remote file."""
        self.ingest_doc_cls: t.Type[FsspecIngestDoc] = FsspecIngestDoc

    def initialize(self):
        """Create the filesystem and verify that the remote path lists at least one object.

        Raises:
            ValueError: if the listing of the configured path is empty.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        self.fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )

        # Verify that can get metadata for an object, validates connections info.
        ls_output = self.fs.ls(self.connector_config.path_without_protocol, detail=False)
        if len(ls_output) < 1:
            raise ValueError(
                f"No objects found in {self.connector_config.remote_url}.",
            )

    def _list_files(self):
        """Return the remote paths of every entry with a positive size.

        Directories listed in cloud storage can cause problems because they are
        seen as 0 byte files, so they are filtered out. An entry whose size is
        missing or ``None`` is treated as size 0 rather than raising ``TypeError``.
        """
        remote_root = self.connector_config.path_without_protocol
        if not self.connector_config.recursive:
            # fs.ls does not walk directories.
            return [
                entry.get("name")
                for entry in self.fs.ls(remote_root, detail=True)
                if (entry.get("size") or 0) > 0
            ]
        # fs.find will recursively walk directories;
        # "size" is a common key for all the cloud protocols with fs.
        return [
            name
            for name, info in self.fs.find(remote_root, detail=True).items()
            if (info.get("size") or 0) > 0
        ]

    def does_path_match_glob(self, path: str) -> bool:
        """Return True when no glob is configured or ``path`` matches any configured glob."""
        patterns = self.connector_config.file_glob
        if patterns is None:
            return True
        if any(fnmatch.fnmatch(path, pattern) for pattern in patterns):
            return True
        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
        return False

    def get_ingest_docs(self):
        """Build ingest docs for every listed file that passes the glob filter.

        Uncompressed files each become one ingest doc. Compressed files are
        skipped unless ``uncompress`` is enabled, in which case each archive is
        expanded via ``process_compressed_doc`` and then cleaned up.
        """
        raw_files = self._list_files()
        # If glob filters provided, use to filter on filepaths
        files = [f for f in raw_files if self.does_path_match_glob(f)]

        # Separate archives from regular files.
        compressed_file_ext = tuple(TAR_FILE_EXT + ZIP_FILE_EXT)
        compressed_files = []
        uncompressed_files = []
        for file in files:
            if file.endswith(compressed_file_ext):
                compressed_files.append(file)
            else:
                uncompressed_files.append(file)

        docs: t.List[BaseSingleIngestDoc] = [
            self.ingest_doc_cls(
                read_config=self.read_config,
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                remote_file_path=file,
            )
            for file in uncompressed_files
        ]
        if not self.connector_config.uncompress:
            return docs

        for compressed_file in compressed_files:
            compressed_doc = self.ingest_doc_cls(
                read_config=self.read_config,
                processor_config=self.processor_config,
                connector_config=self.connector_config,
                remote_file_path=compressed_file,
            )
            try:
                local_ingest_docs = self.process_compressed_doc(doc=compressed_doc)
                logger.info(f"adding {len(local_ingest_docs)} from {compressed_file}")
                docs.extend(local_ingest_docs)
            finally:
                # Remove the downloaded archive whether or not expansion succeeded.
                compressed_doc.cleanup_file()
        return docs
268
-
269
-
270
@dataclass
class WriteTextConfig(EnhancedDataClassJsonMixin, ABC):
    """Abstract base for write-text options; declares no fields of its own."""
273
-
274
-
275
@dataclass
class FsspecWriteConfig(WriteConfig):
    """Write configuration for fsspec destinations."""

    # Optional extra options; serialised by get_write_text_config when set.
    write_text_config: t.Optional[WriteTextConfig] = None

    def get_write_text_config(self) -> t.Dict[str, t.Any]:
        """Return ``write_text_config`` as a dict, or an empty dict when it is falsy."""
        config = self.write_text_config
        return config.to_dict() if config else {}
283
-
284
-
285
@dataclass
class FsspecDestinationConnector(BaseDestinationConnector):
    """Destination connector that uploads JSON output through an fsspec filesystem."""

    connector_config: SimpleFsspecConfig
    write_config: FsspecWriteConfig

    def initialize(self):
        """Create the filesystem for the configured protocol and check the connection."""
        from fsspec import AbstractFileSystem, get_filesystem_class

        self.fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )
        self.check_connection()

    def check_connection(self):
        """Validate the connection by listing the first segment of the destination path.

        Raises:
            DestinationConnectionError: if creating the filesystem or listing
                raises. The original exception is attached as ``__cause__``.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        try:
            fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
                **self.connector_config.get_access_config(),
            )

            # e.g. Dropbox path starts with /
            bucket_name = "/" if self.connector_config.path_without_protocol.startswith("/") else ""
            bucket_name += self.connector_config.dir_path.split("/")[0]

            logger.info(f"checking connection for destination {bucket_name}")
            fs.ls(path=bucket_name, detail=False)
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def write_dict(
        self,
        *args,
        elements_dict: t.List[t.Dict[str, t.Any]],
        filename: t.Optional[str] = None,
        indent: int = 4,
        encoding: str = "utf-8",
        **kwargs,
    ) -> None:
        """Serialise ``elements_dict`` to JSON and write it to the remote destination.

        Args:
            elements_dict: Elements to serialise with ``json.dumps``.
            filename: Optional name appended to the destination folder. When
                omitted the destination folder path itself is written to.
            indent: Indentation passed to ``json.dumps``.
            encoding: Text encoding passed to ``fs.write_text``.
            *args, **kwargs: Accepted for interface compatibility; unused here.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )

        logger.info(f"writing content using filesystem: {type(fs).__name__}")

        # A single-argument os.path.join() returned the folder unchanged, so that
        # call was removed; PurePath below supplies the separator when joining.
        output_folder = self.connector_config.path_without_protocol
        # Strip file separators from both ends of the filename so the join cannot
        # discard the folder part.
        filename = filename.strip(os.sep) if filename else filename
        output_path = str(PurePath(output_folder, filename)) if filename else output_folder
        full_output_path = f"{self.connector_config.protocol}://{output_path}"
        logger.debug(f"uploading content to {full_output_path}")
        write_text_configs = self.write_config.get_write_text_config() if self.write_config else {}
        fs.write_text(
            full_output_path,
            json.dumps(elements_dict, indent=indent),
            encoding=encoding,
            **write_text_configs,
        )

    def get_elements_dict(self, docs: t.List[BaseSingleIngestDoc]) -> t.List[t.Dict[str, t.Any]]:
        """Not implemented in this class: the body is empty and returns ``None``."""
        pass

    def write(self, docs: t.List[BaseSingleIngestDoc]) -> None:
        """Upload each doc's locally written JSON output to the destination."""
        for doc in docs:
            # An empty base output filename is normalised to None.
            filename = doc.base_output_filename or None
            with open(doc._output_filename) as json_file:
                logger.debug(f"uploading content from {doc._output_filename}")
                json_list = json.load(json_file)
                self.write_dict(elements_dict=json_list, filename=filename)