unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568) hide show
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +21 -21
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,26 +0,0 @@
1
- # https://developers.notion.com/reference/property-value-object#date-property-values
2
- from dataclasses import dataclass
3
- from typing import Optional
4
-
5
- from htmlBuilder.tags import Div, HtmlTag
6
-
7
- from unstructured_ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin
8
-
9
-
10
@dataclass
class Date(FromJSONMixin, GetHTMLMixin):
    """Date value with a required start and optional end / time zone."""

    start: str
    end: Optional[str] = None
    time_zone: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance by passing the entries of ``data`` as keyword arguments."""
        instance = cls(**data)
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` whose text is ``start``, then `` - end`` and `` time_zone`` if set."""
        pieces = [f"{self.start}"]
        if self.end:
            pieces.append(f" - {self.end}")
        if self.time_zone:
            pieces.append(f" {self.time_zone}")
        return Div([], "".join(pieces))
@@ -1,51 +0,0 @@
1
- # https://developers.notion.com/reference/file-object
2
- from dataclasses import dataclass
3
- from typing import Optional
4
-
5
- from htmlBuilder.attributes import Href
6
- from htmlBuilder.tags import A, HtmlTag
7
-
8
- from unstructured_ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin
9
-
10
-
11
@dataclass
class External(FromJSONMixin):
    """File reference that carries only a ``url``."""

    url: str

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance by passing the entries of ``data`` as keyword arguments."""
        instance = cls(**data)
        return instance
18
-
19
-
20
@dataclass
class File(FromJSONMixin):
    """File reference that carries a ``url`` and an ``expiry_time``."""

    url: str
    expiry_time: str

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance by passing the entries of ``data`` as keyword arguments."""
        instance = cls(**data)
        return instance
28
-
29
-
30
@dataclass
class FileObject(FromJSONMixin, GetHTMLMixin):
    """File object tagged by ``type``, holding either an ``External`` or a ``File`` payload."""

    type: str
    external: Optional[External] = None
    file: Optional[File] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Parse ``data``: read ``data["type"]`` and populate the matching payload field.

        Types other than ``"external"`` and ``"file"`` leave both payloads as ``None``.
        """
        kind = data["type"]
        parsed = cls(type=kind)
        if kind == "external":
            parsed.external = External.from_dict(data["external"])
            return parsed
        if kind == "file":
            parsed.file = File.from_dict(data["file"])
        return parsed

    def get_html(self) -> Optional[HtmlTag]:
        """Return an anchor linking to the payload URL, or ``None`` if no payload is set.

        The ``file`` payload takes precedence over ``external``.
        """
        target = self.file or self.external
        if not target:
            return None
        return A([Href(target.url)], target.url)
@@ -1,76 +0,0 @@
1
- # https://developers.notion.com/reference/user
2
- from dataclasses import dataclass, field
3
- from typing import Optional
4
-
5
- from htmlBuilder.attributes import Href
6
- from htmlBuilder.tags import A, Div, HtmlTag
7
-
8
- from unstructured_ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin
9
-
10
-
11
@dataclass
class PartialUser(FromJSONMixin):
    """Minimal user reference: an ``id`` plus the constant ``object`` tag ``"user"``."""

    id: str
    object: str = "user"

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance from ``data["id"]`` only; every other key is ignored."""
        user_id = data["id"]
        return cls(id=user_id)
19
-
20
-
21
@dataclass
class User(FromJSONMixin, GetHTMLMixin):
    """User record that can render itself as text or HTML.

    When ``avatar_url`` is set, the name is rendered as a link to the avatar;
    otherwise only the name is rendered.
    """

    # NOTE(review): annotated as dict, but PartialUser in this module treats
    # "object" as the string "user" -- confirm the intended type.
    object: dict
    id: str
    type: Optional[str] = None
    name: Optional[str] = None
    avatar_url: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance by passing the entries of ``data`` as keyword arguments."""
        return cls(**data)

    def get_text(self) -> Optional[str]:
        """Return the name, as a markdown link ``[name](avatar_url)`` when an avatar is set."""
        if self.avatar_url:
            # The link previously lacked its closing parenthesis, producing
            # malformed markdown such as "[name](url".
            return f"[{self.name}]({self.avatar_url})"
        return self.name

    def get_html(self) -> Optional[HtmlTag]:
        """Return an anchor to the avatar when one is set, otherwise a ``Div`` with the name."""
        if self.avatar_url:
            return A([Href(self.avatar_url)], self.name)
        return Div([], self.name)
44
-
45
-
46
@dataclass
class People(User):
    """``User`` extended with a ``person`` dict that defaults to a fresh empty dict."""

    person: dict = field(default_factory=dict)
49
-
50
-
51
@dataclass
class Bots(FromJSONMixin, GetHTMLMixin):
    """Bot user record that can render itself as text or HTML.

    When ``avatar_url`` is set, the name is rendered as a link to the avatar;
    otherwise only the name is rendered.
    """

    # NOTE(review): annotated as dict, but PartialUser in this module treats
    # "object" as the string "user" -- confirm the intended type.
    object: dict
    id: str
    bot: dict
    owner: dict
    type: str
    workspace_name: str
    name: Optional[str] = None
    avatar_url: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance by passing the entries of ``data`` as keyword arguments."""
        return cls(**data)

    def get_text(self) -> Optional[str]:
        """Return the name, as a markdown link ``[name](avatar_url)`` when an avatar is set."""
        if self.avatar_url:
            # The link previously lacked its closing parenthesis, producing
            # malformed markdown such as "[name](url".
            return f"[{self.name}]({self.avatar_url})"
        return self.name

    def get_html(self) -> Optional[HtmlTag]:
        """Return an anchor to the avatar when one is set, otherwise a ``Div`` with the name."""
        if self.avatar_url:
            return A([Href(self.avatar_url)], self.name)
        return Div([], self.name)
@@ -1,232 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass, field
3
- from pathlib import Path
4
-
5
- from unstructured_ingest.enhanced_dataclass import enhanced_field
6
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
7
- from unstructured_ingest.interfaces import (
8
- AccessConfig,
9
- BaseConnectorConfig,
10
- BaseSingleIngestDoc,
11
- BaseSourceConnector,
12
- IngestDocCleanupMixin,
13
- SourceConnectorCleanupMixin,
14
- SourceMetadata,
15
- )
16
- from unstructured_ingest.logger import logger
17
- from unstructured_ingest.utils.dep_check import requires_dependencies
18
- from unstructured_ingest.utils.string_and_date_utils import ensure_isoformat_datetime
19
-
20
- if t.TYPE_CHECKING:
21
- from office365.graph_client import GraphClient
22
- from office365.onedrive.driveitems.driveItem import DriveItem
23
# Threshold compared against a file's "size" property; larger files are
# downloaded in chunks. Despite the name, the value is compared as a raw size
# count (the log message reports it in bytes), not as megabytes.
MAX_MB_SIZE = 512_000_000
24
-
25
-
26
@dataclass
class OneDriveAccessConfig(AccessConfig):
    """Secret part of the OneDrive configuration."""

    # Marked sensitive and excluded from repr; also accepted under the name "client_cred".
    client_credential: str = enhanced_field(
        repr=False,
        sensitive=True,
        overload_name="client_cred",
    )
29
-
30
-
31
@dataclass
class SimpleOneDriveConfig(BaseConnectorConfig):
    """Connection settings for OneDrive plus a token factory built from them."""

    access_config: OneDriveAccessConfig
    client_id: str
    user_pname: str
    tenant: str = field(repr=False)
    authority_url: t.Optional[str] = field(repr=False, default="https://login.microsoftonline.com")
    path: t.Optional[str] = field(default="")
    recursive: bool = False

    def __post_init__(self):
        """Reject empty mandatory values and expose ``_acquire_token`` as ``token_factory``."""
        missing_mandatory = (
            not self.client_id
            or not self.access_config.client_credential
            or not self.user_pname
        )
        if missing_mandatory:
            raise ValueError(
                "Please provide all the following mandatory values:"
                "\n-ms-client_id\n-ms-client_cred\n-ms-user-pname",
            )
        self.token_factory = self._acquire_token

    @SourceConnectionError.wrap
    @requires_dependencies(["msal"])
    def _acquire_token(self):
        """Request a client-credentials token for the Microsoft Graph default scope via msal.

        A ``ValueError`` raised while setting up the application or acquiring
        the token is logged and re-raised.
        """
        from msal import ConfidentialClientApplication

        graph_scopes = ["https://graph.microsoft.com/.default"]
        try:
            msal_app = ConfidentialClientApplication(
                authority=f"{self.authority_url}/{self.tenant}",
                client_id=self.client_id,
                client_credential=self.access_config.client_credential,
            )
            token = msal_app.acquire_token_for_client(scopes=graph_scopes)
        except ValueError as exc:
            logger.error("Couldn't set up credentials for OneDrive")
            raise exc
        return token
65
-
66
-
67
@dataclass
class OneDriveIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single OneDrive file: resolves local paths, fetches metadata and downloads content."""

    connector_config: SimpleOneDriveConfig
    file_name: str
    file_path: str
    registry_name: str = "onedrive"

    def __post_init__(self):
        """Validate the file extension and derive remote and local paths.

        Raises:
            ValueError: if ``file_name`` has no extension.
        """
        self.ext = Path(self.file_name).suffix
        if not self.ext:
            raise ValueError("Unsupported file without extension.")

        self.server_relative_path = self.file_path + "/" + self.file_name
        self._set_download_paths()

    def _set_download_paths(self) -> None:
        """Parses the folder structure from the source and creates the download and output paths"""
        download_path = Path(f"{self.read_config.download_dir}")
        output_path = Path(f"{self.processor_config.output_dir}")

        # Mirror the remote folder structure only when the file is not at the
        # root (an empty file_path leaves the base directories untouched).
        if self.file_path:
            download_path = (download_path / self.file_path).resolve()
            output_path = (output_path / self.file_path).resolve()

        self.download_dir = download_path
        self.download_filepath = (download_path / self.file_name).resolve()
        self.output_dir = output_path
        self.output_filepath = (output_path / (self.file_name + ".json")).resolve()

    @property
    def filename(self):
        """Resolved local path the file is downloaded to."""
        return Path(self.download_filepath).resolve()

    @property
    def _output_filename(self):
        """Resolved local path of the JSON output for this file."""
        return Path(self.output_filepath).resolve()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Identify the record by user principal name and server-relative path."""
        return {
            "user_pname": self.connector_config.user_pname,
            "server_relative_path": self.server_relative_path,
        }

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["office365"], extras="onedrive")
    def _fetch_file(self):
        """Look up the drive item at ``server_relative_path`` in the user's drive."""
        from office365.graph_client import GraphClient

        client = GraphClient(self.connector_config.token_factory)
        root = client.users[self.connector_config.user_pname].drive.get().execute_query().root
        file = root.get_by_path(self.server_relative_path).get().execute_query()
        return file

    def update_source_metadata(self, **kwargs):
        """Populate ``source_metadata`` from the drive item.

        Args:
            **kwargs: may contain ``file``, an already-fetched drive item (or
                ``None``). The item is fetched only when the key is absent.
        """
        # A ``kwargs.get("file", self._fetch_file())`` default would be evaluated
        # eagerly, triggering a redundant network request even when the caller
        # already supplied the item.
        file = kwargs["file"] if "file" in kwargs else self._fetch_file()
        if file is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return

        version = None
        if (n_versions := len(file.versions)) > 0:
            version = file.versions[n_versions - 1].properties.get("id", None)

        self.source_metadata = SourceMetadata(
            date_created=ensure_isoformat_datetime(timestamp=file.created_datetime),
            date_modified=ensure_isoformat_datetime(timestamp=file.last_modified_datetime),
            version=version,
            source_url=file.parent_reference.path + "/" + self.file_name,
            exists=True,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the file to ``filename``, in chunks when larger than ``MAX_MB_SIZE``.

        Raises:
            ValueError: if the drive item could not be retrieved.
        """
        file = self._fetch_file()
        self.update_source_metadata(file=file)
        if file is None:
            raise ValueError(
                f"Failed to retrieve file {self.file_path}/{self.file_name}",
            )

        fsize = file.get_property("size", 0)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        if not self.download_dir.is_dir():
            logger.debug(f"creating directory: {self.download_dir}")
            self.download_dir.mkdir(parents=True, exist_ok=True)

        if fsize > MAX_MB_SIZE:
            logger.info(f"downloading file with size: {fsize} bytes in chunks")
            with self.filename.open(mode="wb") as f:
                file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
        else:
            with self.filename.open(mode="wb") as f:
                file.download(f).execute_query()
        logger.info(f"file downloaded: {self.filename}")
172
-
173
-
174
@dataclass
class OneDriveSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Lists files in a user's OneDrive and wraps each one in an ``OneDriveIngestDoc``."""

    connector_config: SimpleOneDriveConfig
    _client: t.Optional["GraphClient"] = field(init=False, default=None)

    @property
    def client(self) -> "GraphClient":
        """Graph client, created on first access and cached afterwards."""
        from office365.graph_client import GraphClient

        if self._client is None:
            self._client = GraphClient(self.connector_config.token_factory)
        return self._client

    @requires_dependencies(["office365"], extras="onedrive")
    def initialize(self):
        """Force creation of the Graph client."""
        _ = self.client

    @requires_dependencies(["office365"], extras="onedrive")
    def check_connection(self):
        """Validate credentials by acquiring a token and creating the client.

        Raises:
            SourceConnectionError: if the token response reports an error or
                any step fails.
        """
        try:
            token_resp: dict = self.connector_config.token_factory()
            if error := token_resp.get("error"):
                raise SourceConnectionError(
                    "{} ({})".format(error, token_resp.get("error_description"))
                )
            _ = self.client
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def _list_objects(self, folder, recursive) -> t.List["DriveItem"]:
        """Return the files directly under ``folder``, descending into subfolders if requested."""
        drive_items = folder.children.get().execute_query()
        files = [d for d in drive_items if d.is_file]
        if not recursive:
            return files
        for subfolder in (d for d in drive_items if d.is_folder):
            files += self._list_objects(subfolder, recursive)
        return files

    def _gen_ingest_doc(self, file: "DriveItem") -> OneDriveIngestDoc:
        """Build an ingest doc from a drive item, deriving its folder from the parent path."""
        # Keep only what follows the last ":" of the parent reference path.
        file_path = file.parent_reference.path.split(":")[-1]
        # startswith() is safe on an empty string, whereas indexing [0] raised
        # IndexError when nothing followed the ":" (item directly in the root).
        if file_path.startswith("/"):
            file_path = file_path[1:]
        return OneDriveIngestDoc(
            connector_config=self.connector_config,
            processor_config=self.processor_config,
            read_config=self.read_config,
            file_name=file.name,
            file_path=file_path,
        )

    def get_ingest_docs(self):
        """Return one ingest doc per file under the configured path (or the drive root).

        Raises:
            ValueError: if the configured path does not resolve to a folder.
        """
        root = self.client.users[self.connector_config.user_pname].drive.get().execute_query().root
        if fpath := self.connector_config.path:
            root = root.get_by_path(fpath).get().execute_query()
            if root is None or not root.is_folder:
                raise ValueError(f"Unable to find directory, given: {fpath}")
        files = self._list_objects(root, self.connector_config.recursive)
        return [self._gen_ingest_doc(f) for f in files]
@@ -1,218 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass, field
3
-
4
- from dataclasses_json.core import Json
5
-
6
- from unstructured_ingest.connector.elasticsearch import (
7
- ElasticsearchDestinationConnector,
8
- ElasticsearchDocumentMeta,
9
- ElasticsearchIngestDoc,
10
- ElasticsearchIngestDocBatch,
11
- ElasticsearchSourceConnector,
12
- SimpleElasticsearchConfig,
13
- )
14
- from unstructured_ingest.enhanced_dataclass import enhanced_field
15
- from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
16
- from unstructured_ingest.interfaces import AccessConfig, BaseSingleIngestDoc
17
- from unstructured_ingest.logger import logger
18
- from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
19
- from unstructured_ingest.utils.dep_check import requires_dependencies
20
-
21
- if t.TYPE_CHECKING:
22
- from opensearchpy import OpenSearch
23
-
24
- """Since the actual OpenSearch project is a fork of Elasticsearch, we are relying
25
- heavily on the Elasticsearch connector code, inheriting the functionality as much as possible."""
26
-
27
-
28
@dataclass
class OpenSearchAccessConfig(AccessConfig):
    """Connection and TLS settings whose dict form is passed to the OpenSearch client."""

    hosts: t.Optional[t.List[str]] = None
    username: t.Optional[str] = None
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    use_ssl: bool = False
    verify_certs: bool = False
    ssl_show_warn: bool = False
    ca_certs: t.Optional[str] = None
    client_cert: t.Optional[str] = None
    client_key: t.Optional[str] = None

    def to_dict(self, **kwargs) -> t.Dict[str, Json]:
        """Return the parent's dict form with an added ``http_auth`` (username, password) tuple."""
        serialized = super().to_dict(**kwargs)
        credentials = (self.username, self.password)
        serialized["http_auth"] = credentials
        return serialized
44
-
45
-
46
@dataclass
class SimpleOpenSearchConfig(SimpleElasticsearchConfig):
    """Elasticsearch connector config narrowed to use ``OpenSearchAccessConfig``."""

    # Defaults to None even though the annotation is not Optional.
    access_config: OpenSearchAccessConfig = None
49
-
50
-
51
@dataclass
class OpenSearchIngestDoc(ElasticsearchIngestDoc):
    """Single OpenSearch document; behaviour is inherited from ``ElasticsearchIngestDoc``.

    ``get_file`` has an empty body here. In this module the document contents
    are written to disk by ``OpenSearchIngestDocBatch.get_files``.
    """

    connector_config: SimpleOpenSearchConfig
    registry_name: str = "opensearch"

    @SourceConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Do nothing; kept so the decorated entry point exists on this class."""
        return None
68
-
69
-
70
@dataclass
class OpenSearchIngestDocBatch(ElasticsearchIngestDocBatch):
    """Batch of OpenSearch documents fetched by id and written to local text files."""

    connector_config: SimpleOpenSearchConfig
    ingest_docs: t.List[OpenSearchIngestDoc] = field(default_factory=list)
    registry_name: str = "opensearch_batch"

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def _get_docs(self):
        """Scan the configured index for the documents whose ids are in ``list_of_ids``."""
        from opensearchpy import OpenSearch
        from opensearchpy.helpers import scan

        client_kwargs = self.connector_config.access_config.to_dict(apply_name_overload=False)
        client = OpenSearch(**client_kwargs)
        id_query = {
            "_source": self.connector_config.fields,
            "version": True,
            "query": {"ids": {"values": self.list_of_ids}},
        }
        hits = scan(
            client,
            query=id_query,
            scroll="1m",
            index=self.connector_config.index_name,
        )
        return list(hits)

    @SourceConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def get_files(self):
        """Write each fetched document to its file and collect the resulting ingest docs.

        The file content is the document's flattened ``_source`` values, one per line.
        """
        index_name = self.connector_config.index_name
        for hit in self._get_docs():
            ingest_doc = OpenSearchIngestDoc(
                processor_config=self.processor_config,
                read_config=self.read_config,
                connector_config=self.connector_config,
                document=hit,
                document_meta=ElasticsearchDocumentMeta(index_name, hit["_id"]),
            )
            ingest_doc.update_source_metadata()
            source_body = hit["_source"]
            target = ingest_doc.filename
            flat_values = flatten_dict(dictionary=source_body).values()
            content = "\n".join([str(value) for value in flat_values])

            target.parent.mkdir(parents=True, exist_ok=True)
            with open(target, "w", encoding="utf8") as out:
                out.write(content)
            self.ingest_docs.append(ingest_doc)
121
-
122
-
123
@dataclass
class OpenSearchSourceConnector(ElasticsearchSourceConnector):
    """Fetches particular fields from all documents in a given opensearch cluster and index"""

    connector_config: SimpleOpenSearchConfig
    _ops: t.Optional["OpenSearch"] = field(init=False, default=None)

    @property
    def ops(self):
        """OpenSearch client, created on first access and cached afterwards."""
        from opensearchpy import OpenSearch

        if self._ops is None:
            self._ops = OpenSearch(
                **self.connector_config.access_config.to_dict(apply_name_overload=False)
            )
        return self._ops

    def check_connection(self):
        """Ping the cluster to validate the connection.

        Raises:
            SourceConnectionError: if the ping returns a falsy result or fails.
        """
        try:
            # Explicit check instead of ``assert``: assertions are removed when
            # Python runs with -O, which would let a failed ping pass silently.
            if not self.ops.ping():
                raise SourceConnectionError("ping to the cluster did not succeed")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def _get_doc_ids(self):
        """Fetches all document ids in an index"""
        from opensearchpy.helpers import scan

        hits = scan(
            self.ops,
            query=self.scan_query,
            scroll="1m",
            index=self.connector_config.index_name,
        )

        return [hit["_id"] for hit in hits]

    def get_ingest_docs(self):
        """Fetches all documents in an index, using ids that are fetched with _get_doc_ids"""
        ids = self._get_doc_ids()
        batch_size = self.connector_config.batch_size
        # Consecutive slices of at most batch_size ids; the last one may be shorter.
        id_batches = [ids[start : start + batch_size] for start in range(0, len(ids), batch_size)]
        return [
            OpenSearchIngestDocBatch(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                list_of_ids=batched_ids,
            )
            for batched_ids in id_batches
        ]
184
-
185
-
186
@dataclass
class OpenSearchDestinationConnector(ElasticsearchDestinationConnector):
    """Writes element dictionaries to an OpenSearch index in byte-size-limited batches."""

    connector_config: SimpleOpenSearchConfig
    _client: t.Optional["OpenSearch"] = field(init=False, default=None)

    @DestinationConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def generate_client(self) -> "OpenSearch":
        """Create an OpenSearch client from the access config."""
        from opensearchpy import OpenSearch

        return OpenSearch(**self.connector_config.access_config.to_dict(apply_name_overload=False))

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]]) -> None:
        """Upload ``elements_dict`` with ``parallel_bulk``, logging every failed item.

        Args:
            *args: accepted and ignored.
            elements_dict: the documents to upload.
        """
        logger.info(
            f"writing document batches to destination"
            f" index named {self.connector_config.index_name}"
            f" at {self.connector_config.access_config.hosts}"
            f" with batch size (in bytes) {self.write_config.batch_size_bytes}"
            f" with {self.write_config.num_processes} (number of) processes"
        )
        from opensearchpy.helpers import parallel_bulk

        for batch in generator_batching_wbytes(
            elements_dict, batch_size_limit_bytes=self.write_config.batch_size_bytes
        ):
            for success, info in parallel_bulk(
                self.client, batch, thread_count=self.write_config.num_processes
            ):
                if not success:
                    # The message needs a %s placeholder: passing ``info`` as a
                    # bare extra argument makes logging fail to format the
                    # record, so the failure details were never logged.
                    logger.error(
                        "upload failed for a batch in opensearch destination connector: %s",
                        info,
                    )