unstructured-ingest 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +49 -0
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/processes/connectors/github.py +221 -0
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,285 +0,0 @@
1
- import hashlib
2
- import os
3
- import typing as t
4
- from collections import defaultdict
5
- from dataclasses import dataclass, field
6
- from itertools import chain
7
- from pathlib import Path
8
-
9
- from unstructured_ingest.enhanced_dataclass import enhanced_field
10
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
11
- from unstructured_ingest.interfaces import (
12
- AccessConfig,
13
- BaseConnectorConfig,
14
- BaseSingleIngestDoc,
15
- BaseSourceConnector,
16
- IngestDocCleanupMixin,
17
- SourceConnectorCleanupMixin,
18
- SourceMetadata,
19
- )
20
- from unstructured_ingest.logger import logger
21
- from unstructured_ingest.utils.dep_check import requires_dependencies
22
-
23
# Maximum number of emails requested per folder.
MAX_NUM_EMAILS = 1_000_000
if t.TYPE_CHECKING:
    # Needed for type annotations only; never imported at runtime.
    from office365.graph_client import GraphClient
26
-
27
-
28
class MissingFolderError(Exception):
    """Signals that no root mail folder matches the requested names."""

    pass
30
-
31
-
32
@dataclass
class OutlookAccessConfig(AccessConfig):
    """Secret settings used to authenticate the Outlook connector."""

    # Flagged sensitive and excluded from repr.
    # NOTE(review): overload_name presumably lets the value be supplied as
    # "client_cred" -- confirm against enhanced_field.
    client_credential: str = enhanced_field(
        repr=False,
        sensitive=True,
        overload_name="client_cred",
    )
35
-
36
-
37
@dataclass
class SimpleOutlookConfig(BaseConnectorConfig):
    """Connection settings for Outlook, plus token acquisition and client creation.

    ``client_id``, ``user_email`` and ``access_config.client_credential`` are all
    required; ``__post_init__`` raises ``ValueError`` when any of them is empty.
    """

    access_config: OutlookAccessConfig
    user_email: str
    client_id: str
    tenant: t.Optional[str] = field(repr=False, default="common")
    authority_url: t.Optional[str] = field(repr=False, default="https://login.microsoftonline.com")
    outlook_folders: t.List[str] = field(default_factory=list)
    recursive: bool = False
    registry_name: str = "outlook"

    def __post_init__(self):
        """Validate the mandatory values and register the token factory.

        Raises:
            ValueError: if client_id, client_credential or user_email is empty.
        """
        if not (self.client_id and self.access_config.client_credential and self.user_email):
            # All three values are required, so the message says "all of".
            raise ValueError(
                "Please provide all of the following mandatory values:"
                "\nclient_id\nclient_cred\nuser_email",
            )
        # Handed to GraphClient in _get_client.
        self.token_factory = self._acquire_token

    @requires_dependencies(["msal"])
    def _acquire_token(self):
        """Request a Microsoft Graph token with the client-credentials flow.

        Returns:
            Whatever ``acquire_token_for_client`` returns, unchanged.

        Raises:
            ValueError: re-raised after logging when MSAL rejects the credentials setup.
        """
        from msal import ConfidentialClientApplication

        try:
            app = ConfidentialClientApplication(
                authority=f"{self.authority_url}/{self.tenant}",
                client_id=self.client_id,
                client_credential=self.access_config.client_credential,
            )
            token = app.acquire_token_for_client(
                scopes=["https://graph.microsoft.com/.default"],
            )
        except ValueError:
            logger.error("Couldn't set up credentials for Outlook")
            raise
        # Log a response without an access token instead of returning it
        # silently; the return value itself is unchanged.
        if isinstance(token, dict) and "access_token" not in token:
            logger.error(
                "Outlook token request did not return an access token: "
                f"{token.get('error')}: {token.get('error_description')}"
            )
        return token

    @requires_dependencies(["office365"], extras="outlook")
    def _get_client(self):
        """Build a new ``GraphClient`` that obtains tokens via ``self.token_factory``."""
        from office365.graph_client import GraphClient

        return GraphClient(self.token_factory)
81
-
82
-
83
@dataclass
class OutlookIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single Outlook message that is downloaded to disk as an ``.eml`` file."""

    connector_config: SimpleOutlookConfig
    message_id: str
    registry_name: str = "outlook"

    def __post_init__(self):
        self._set_download_paths()

    def hash_mail_name(self, id):
        """Outlook email ids are 152 char long. Hash to shorten to 16."""
        # The parameter name ``id`` is kept for backward compatibility with
        # keyword callers, even though it shadows the builtin.
        return hashlib.sha256(id.encode("utf-8")).hexdigest()[:16]

    def _set_download_paths(self) -> None:
        """Creates paths for downloading and parsing."""
        download_path = Path(f"{self.read_config.download_dir}")
        output_path = Path(f"{self.processor_config.output_dir}")
        # Hash once; both file names derive from the same shortened id.
        hashed_name = self.hash_mail_name(self.message_id)

        self.download_dir = download_path
        self.download_filepath = (download_path / f"{hashed_name}.eml").resolve()
        self.output_dir = output_path
        self.output_filepath = (output_path / f"{hashed_name}.eml.json").resolve()

    @property
    def filename(self):
        """Resolved path of the downloaded ``.eml`` file."""
        return Path(self.download_filepath).resolve()

    @property
    def _output_filename(self):
        """Resolved path of the ``.eml.json`` output file."""
        return Path(self.output_filepath).resolve()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Identify the message by its id and the mailbox it belongs to."""
        return {
            "message_id": self.message_id,
            "user_email": self.connector_config.user_email,
        }

    @requires_dependencies(["office365"], extras="outlook")
    def update_source_metadata(self, **kwargs):
        """Fetch the message and store its metadata in ``self.source_metadata``.

        A 404 response is recorded as ``exists=False``; any other
        ``ClientRequestException`` is re-raised.
        """
        from office365.runtime.client_request_exception import ClientRequestException

        try:
            client = self.connector_config._get_client()
            msg = (
                client.users[self.connector_config.user_email]
                .messages[self.message_id]
                .get()
                .execute_query()
            )
        except ClientRequestException as e:
            if e.response.status_code == 404:
                self.source_metadata = SourceMetadata(
                    exists=False,
                )
                return
            raise
        self.source_metadata = SourceMetadata(
            date_created=msg.created_datetime.isoformat(),
            date_modified=msg.last_modified_datetime.isoformat(),
            version=msg.get_property("changeKey"),
            source_url=msg.get_property("webLink"),
            exists=True,
        )

    @SourceConnectionNetworkError.wrap
    def _run_download(self, local_file):
        """Stream the message content into the already-open ``local_file``."""
        client = self.connector_config._get_client()
        client.users[self.connector_config.user_email].messages[self.message_id].download(
            local_file,
        ).execute_query()

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    @requires_dependencies(["office365"], extras="outlook")
    def get_file(self):
        """Relies on Office365 python sdk message object to do the download.

        Best effort: any failure is logged and swallowed, and the method returns
        ``None`` either way.
        """
        hashed_name = self.hash_mail_name(self.message_id)
        try:
            # No client is created here: update_source_metadata and
            # _run_download each build the client they use.
            self.update_source_metadata()
            if not self.download_dir.is_dir():
                logger.debug(f"creating directory: {self.download_dir}")
                self.download_dir.mkdir(parents=True, exist_ok=True)

            # Write to the path computed in _set_download_paths, which is also
            # what the ``filename`` property exposes.
            with open(self.download_filepath, "wb") as local_file:
                self._run_download(local_file=local_file)

        except Exception as e:
            logger.error(
                f"Error while downloading and saving file: {hashed_name}.",
            )
            logger.error(e)
            return
        logger.info(f"file downloaded: {hashed_name}")
        return
187
-
188
-
189
@dataclass
class OutlookSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Lists the messages in the requested Outlook folders and their subfolders."""

    connector_config: SimpleOutlookConfig
    _client: t.Optional["GraphClient"] = field(init=False, default=None)

    @property
    def client(self) -> "GraphClient":
        """Graph client, created on first access and then reused."""
        if self._client is None:
            self._client = self.connector_config._get_client()
        return self._client

    def initialize(self):
        """Resolve the folder ids.

        Raises:
            SourceConnectionError: wrapping any failure from ``get_folder_ids``.
        """
        try:
            self.get_folder_ids()
        except Exception as e:
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def check_connection(self):
        """Check that a client can be constructed.

        Raises:
            SourceConnectionError: if building the client fails.
        """
        try:
            _ = self.client
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def recurse_folders(self, folder_id, main_folder_dict):
        """We only get a count of subfolders for any folder.
        Have to make additional calls to get subfolder ids."""
        # NOTE(review): connector_config.recursive is not consulted here;
        # subfolders are always traversed -- confirm this is intended.
        subfolders = (
            self.client.users[self.connector_config.user_email]
            .mail_folders[folder_id]
            .child_folders.get()
            .execute_query()
        )
        for subfolder in subfolders:
            parent_id = subfolder.get_property("parentFolderId")
            # Add the subfolder to every root entry that already holds its parent.
            for folder_ids in main_folder_dict.values():
                if parent_id in folder_ids:
                    folder_ids.append(subfolder.id)
            if subfolder.get_property("childFolderCount") > 0:
                self.recurse_folders(subfolder.id, main_folder_dict)

    def get_folder_ids(self):
        """Sets the mail folder ids and subfolder ids for requested root mail folders.

        Populates ``self.root_folders`` (display name -> list of folder ids) and
        ``self.selected_folder_ids``.

        Raises:
            MissingFolderError: if no root folder matches the requested names.
        """
        self.root_folders = defaultdict(list)
        root_folders_with_subfolders = []
        get_root_folders = (
            self.client.users[self.connector_config.user_email].mail_folders.get().execute_query()
        )

        for folder in get_root_folders:
            self.root_folders[folder.display_name].append(folder.id)
            if folder.get_property("childFolderCount") > 0:
                root_folders_with_subfolders.append(folder.id)

        for folder in root_folders_with_subfolders:
            self.recurse_folders(folder, self.root_folders)

        # Narrow down all mail folder ids (plus all subfolders) to the ones that were requested.
        # The case-insensitive name set is built once, not once per root folder.
        requested = {name.lower() for name in self.connector_config.outlook_folders}
        self.selected_folder_ids = list(
            chain.from_iterable(
                ids for name, ids in self.root_folders.items() if name.lower() in requested
            ),
        )
        if not self.selected_folder_ids:
            raise MissingFolderError(
                "There are no root folders with the names: "
                f"{self.connector_config.outlook_folders}",
            )

    def get_ingest_docs(self):
        """Returns a list of all the message objects that are in the requested root folder(s)."""
        filtered_messages = []

        # Get all the relevant messages in the selected folders/subfolders.
        for folder_id in self.selected_folder_ids:
            messages = (
                self.client.users[self.connector_config.user_email]
                .mail_folders[folder_id]
                .messages.get()
                .top(MAX_NUM_EMAILS)  # Prevents the return from paging
                .execute_query()
            )
            # Skip empty list if there are no messages in folder.
            if messages:
                filtered_messages.append(messages)
        return [
            OutlookIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                message_id=message.id,
            )
            for message in chain.from_iterable(filtered_messages)
        ]
@@ -1,150 +0,0 @@
1
- import copy
2
- import json
3
- import multiprocessing as mp
4
- import typing as t
5
- import uuid
6
- from dataclasses import dataclass
7
-
8
- from unstructured_ingest.enhanced_dataclass import enhanced_field
9
- from unstructured_ingest.enhanced_dataclass.core import _asdict
10
- from unstructured_ingest.error import DestinationConnectionError, WriteError
11
- from unstructured_ingest.interfaces import (
12
- AccessConfig,
13
- BaseConnectorConfig,
14
- BaseDestinationConnector,
15
- ConfigSessionHandleMixin,
16
- IngestDocSessionHandleMixin,
17
- WriteConfig,
18
- )
19
- from unstructured_ingest.logger import logger
20
- from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
21
- from unstructured_ingest.utils.dep_check import requires_dependencies
22
-
23
- if t.TYPE_CHECKING:
24
- from pinecone import Index as PineconeIndex
25
-
26
-
27
@dataclass
class PineconeAccessConfig(AccessConfig):
    """Secret settings for the Pinecone connector."""

    # API key, flagged as sensitive.
    api_key: str = enhanced_field(sensitive=True)
30
-
31
-
32
@dataclass
class SimplePineconeConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
    """Connection settings that identify the target Pinecone index."""

    # Name of the index that the connector opens and upserts into.
    index_name: str
    # Not read anywhere in this module.
    environment: str
    # Holds the API key.
    access_config: PineconeAccessConfig
37
-
38
-
39
@dataclass
class PineconeWriteConfig(WriteConfig):
    """Upload tuning options for the Pinecone destination."""

    # Number of elements sent per upsert call.
    batch_size: int = 50
    # 1 uploads in-process; any other value uses a multiprocessing pool.
    num_processes: int = 1
43
-
44
-
45
@dataclass
class PineconeDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConnector):
    """Destination connector that upserts element dicts into a Pinecone index."""

    write_config: PineconeWriteConfig
    connector_config: SimplePineconeConfig
    _index: t.Optional["PineconeIndex"] = None

    def to_dict(self, **kwargs):
        """
        The _index variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        self_cp = copy.copy(self)
        # _index is a declared field, so it always exists on the copy.
        self_cp._index = None
        return _asdict(self_cp, **kwargs)

    @property
    def pinecone_index(self):
        """Index handle, created on first access and then reused."""
        if self._index is None:
            self._index = self.get_index()
        return self._index

    def initialize(self):
        """No setup is performed; the index is opened lazily via ``pinecone_index``."""
        pass

    @requires_dependencies(["pinecone"], extras="pinecone")
    def get_index(self) -> "PineconeIndex":
        """Open the configured index and log its description at debug level."""
        from pinecone import Pinecone
        from unstructured import __version__ as unstructured_version

        pc = Pinecone(
            api_key=self.connector_config.access_config.api_key,
            source_tag=f"unstructured=={unstructured_version}",
        )

        index = pc.Index(self.connector_config.index_name)
        logger.debug(f"connected to index: {pc.describe_index(self.connector_config.index_name)}")
        return index

    @requires_dependencies(["pinecone"], extras="pinecone")
    def create_index(self) -> "PineconeIndex":
        """Alias for ``get_index`` that logs a deprecation warning first."""
        logger.warning(
            "create_index (a misleading name as of now) will be deprecated soon. "
            "Use get_index instead. This is due to unstructured supporting actual "
            "index creation/provisioning now. "
            "(Support for v2 connectors only. you are currently using a v1 connector.)"
        )
        return self.get_index()

    @DestinationConnectionError.wrap
    def check_connection(self):
        """Force creation of the index handle so connection problems surface early."""
        _ = self.pinecone_index

    @DestinationConnectionError.wrap
    @requires_dependencies(["pinecone"], extras="pinecone")
    def upsert_batch(self, batch):
        """Upsert one batch of normalized records.

        Raises:
            WriteError: when Pinecone raises an ``ApiException``.
        """
        import pinecone.core.client.exceptions

        index = self.pinecone_index
        try:
            response = index.upsert(batch)
        except pinecone.core.client.exceptions.ApiException as api_error:
            raise WriteError(f"http error: {api_error}") from api_error
        logger.debug(f"results: {response}")

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upsert ``elements_dict`` in batches, in-process or through a process pool."""
        logger.info(
            f"Upserting {len(elements_dict)} elements to destination "
            f"index at {self.connector_config.index_name}",
        )

        pinecone_batch_size = self.write_config.batch_size

        logger.info(f"using {self.write_config.num_processes} processes to upload")
        if self.write_config.num_processes == 1:
            for chunk in batch_generator(elements_dict, pinecone_batch_size):
                self.upsert_batch(chunk)

        else:
            # NOTE(review): pool.map pickles the bound method and therefore
            # ``self``; the to_dict docstring says an initialized _index cannot
            # be pickled -- confirm the index is still unset on this path.
            with mp.Pool(
                processes=self.write_config.num_processes,
            ) as pool:
                # pool.map accepts any iterable, so no intermediate list is built.
                pool.map(self.upsert_batch, batch_generator(elements_dict, pinecone_batch_size))

    def normalize_dict(self, element_dict: dict) -> dict:
        """Convert an element dict into a Pinecone record (``id``/``values``/``metadata``).

        The caller's dict is left untouched; a shallow copy is used instead.
        """
        # While flatten_dict enables indexing on various fields,
        # element_serialized enables easily reloading the element object to memory.
        # element_serialized is formed without text/embeddings to avoid data bloating.
        remaining = dict(element_dict)
        embeddings = remaining.pop("embeddings", None)
        text = remaining.pop("text", None)
        return {
            "id": str(uuid.uuid4()),
            "values": embeddings,
            "metadata": {
                "text": text,
                "element_serialized": json.dumps(remaining),
                **flatten_dict(
                    remaining,
                    separator="-",
                    flatten_lists=True,
                    remove_none=True,
                ),
            },
        }
@@ -1,144 +0,0 @@
1
- import json
2
- import multiprocessing as mp
3
- import typing as t
4
- import uuid
5
- from dataclasses import dataclass
6
-
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
8
- from unstructured_ingest.error import DestinationConnectionError, WriteError
9
- from unstructured_ingest.interfaces import (
10
- AccessConfig,
11
- BaseConnectorConfig,
12
- BaseDestinationConnector,
13
- ConfigSessionHandleMixin,
14
- IngestDocSessionHandleMixin,
15
- WriteConfig,
16
- )
17
- from unstructured_ingest.logger import logger
18
- from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
19
- from unstructured_ingest.utils.dep_check import requires_dependencies
20
-
21
- if t.TYPE_CHECKING:
22
- from qdrant_client import QdrantClient
23
-
24
-
25
@dataclass
class QdrantAccessConfig(AccessConfig):
    """Secret settings for the Qdrant connector."""

    # Optional API key, flagged as sensitive.
    api_key: t.Optional[str] = enhanced_field(sensitive=True)
28
-
29
-
30
@dataclass
class SimpleQdrantConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
    """Connection settings for Qdrant.

    Every field except ``collection_name`` and ``access_config`` is passed
    straight to the ``QdrantClient`` constructor by the destination connector.
    """

    # Collection that receives the upserted points.
    collection_name: str

    # Where to connect.
    location: t.Optional[str] = None
    url: t.Optional[str] = None
    port: t.Optional[int] = 6333
    grpc_port: t.Optional[int] = 6334
    prefer_grpc: t.Optional[bool] = False
    https: t.Optional[bool] = None
    prefix: t.Optional[str] = None
    timeout: t.Optional[float] = None
    host: t.Optional[str] = None
    path: t.Optional[str] = None
    force_disable_check_same_thread: t.Optional[bool] = False

    # Optional holder of the API key; when absent, no key is sent.
    access_config: t.Optional[QdrantAccessConfig] = None
45
-
46
-
47
@dataclass
class QdrantWriteConfig(WriteConfig):
    """Upload tuning options for the Qdrant destination."""

    # Number of elements sent per upsert call.
    batch_size: int = 50
    # 1 uploads in-process; any other value uses a multiprocessing pool.
    num_processes: int = 1
51
-
52
-
53
@dataclass
class QdrantDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConnector):
    """Destination connector that upserts element dicts into a Qdrant collection."""

    write_config: QdrantWriteConfig
    connector_config: SimpleQdrantConfig
    _client: t.Optional["QdrantClient"] = None

    @property
    def qdrant_client(self):
        """Client instance, created on first access and then reused."""
        if self._client is None:
            self._client = self.create_client()
        return self._client

    def initialize(self):
        """No setup is performed; the client is created lazily via ``qdrant_client``."""
        ...  # fmt: skip

    @requires_dependencies(["qdrant_client"], extras="qdrant")
    def create_client(self) -> "QdrantClient":
        """Build a ``QdrantClient`` from the connector configuration."""
        from qdrant_client import QdrantClient

        config = self.connector_config
        # The API key is optional: without an access_config none is sent.
        api_key = config.access_config.api_key if config.access_config else None

        client = QdrantClient(
            location=config.location,
            url=config.url,
            port=config.port,
            grpc_port=config.grpc_port,
            prefer_grpc=config.prefer_grpc,
            https=config.https,
            api_key=api_key,
            prefix=config.prefix,
            timeout=config.timeout,
            host=config.host,
            path=config.path,
            force_disable_check_same_thread=config.force_disable_check_same_thread,
        )

        return client

    @DestinationConnectionError.wrap
    def check_connection(self):
        """List the collections so that connection problems surface early."""
        self.qdrant_client.get_collections()

    @DestinationConnectionError.wrap
    @requires_dependencies(["qdrant_client"], extras="qdrant")
    def upsert_batch(self, batch: t.List[t.Dict[str, t.Any]]):
        """Upsert one batch of normalized records and wait for completion.

        Raises:
            WriteError: wrapping any exception raised while building or sending
                the points.
        """
        from qdrant_client import models

        client = self.qdrant_client
        try:
            # Annotation uses t.List, matching the rest of this module.
            points: t.List[models.PointStruct] = [models.PointStruct(**item) for item in batch]
            response = client.upsert(
                self.connector_config.collection_name, points=points, wait=True
            )
        except Exception as api_error:
            raise WriteError(f"Qdrant error: {api_error}") from api_error
        logger.debug(f"results: {response}")

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upsert ``elements_dict`` in batches, in-process or through a process pool."""
        logger.info(
            f"Upserting {len(elements_dict)} elements to "
            f"{self.connector_config.collection_name}",
        )

        qdrant_batch_size = self.write_config.batch_size

        logger.info(f"using {self.write_config.num_processes} processes to upload")
        if self.write_config.num_processes == 1:
            for chunk in batch_generator(elements_dict, qdrant_batch_size):
                self.upsert_batch(chunk)

        else:
            with mp.Pool(
                processes=self.write_config.num_processes,
            ) as pool:
                # pool.map accepts any iterable, so no intermediate list is built.
                pool.map(self.upsert_batch, batch_generator(elements_dict, qdrant_batch_size))

    def normalize_dict(self, element_dict: dict) -> dict:
        """Convert an element dict into a Qdrant point (``id``/``vector``/``payload``).

        The caller's dict is left untouched; a shallow copy is used instead.
        """
        remaining = dict(element_dict)
        # Remove embeddings and text first so they are left out of element_serialized.
        vector = remaining.pop("embeddings", {})
        text = remaining.pop("text", None)
        return {
            "id": str(uuid.uuid4()),
            "vector": vector,
            "payload": {
                "text": text,
                "element_serialized": json.dumps(remaining),
                **flatten_dict(
                    remaining,
                    separator="-",
                    flatten_lists=True,
                ),
            },
        }