unstructured-ingest 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +49 -0
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/processes/connectors/github.py +221 -0
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,180 +0,0 @@
1
- import datetime as dt
2
- import typing as t
3
- from dataclasses import dataclass
4
- from pathlib import Path
5
-
6
- from unstructured_ingest.enhanced_dataclass import enhanced_field
7
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
8
- from unstructured_ingest.interfaces import (
9
- AccessConfig,
10
- BaseConnectorConfig,
11
- BaseSingleIngestDoc,
12
- BaseSourceConnector,
13
- IngestDocCleanupMixin,
14
- SourceConnectorCleanupMixin,
15
- SourceMetadata,
16
- )
17
- from unstructured_ingest.logger import logger
18
- from unstructured_ingest.utils.dep_check import (
19
- requires_dependencies,
20
- )
21
-
22
-
23
- @dataclass
24
- class DiscordAccessConfig(AccessConfig):
25
- token: str = enhanced_field(sensitive=True)
26
-
27
-
28
- @dataclass
29
- class SimpleDiscordConfig(BaseConnectorConfig):
30
- """Connector config where channels is a comma separated list of
31
- Discord channels to pull messages from.
32
- """
33
-
34
- # Discord Specific Options
35
- access_config: DiscordAccessConfig
36
- channels: t.List[str]
37
- period: t.Optional[int] = None
38
-
39
-
40
- @dataclass
41
- class DiscordIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
42
- """Class encapsulating fetching a doc and writing processed results (but not
43
- doing the processing!).
44
- Also includes a cleanup method. When things go wrong and the cleanup
45
- method is not called, the file is left behind on the filesystem to assist debugging.
46
- """
47
-
48
- connector_config: SimpleDiscordConfig
49
- channel: str
50
- days: t.Optional[int] = None
51
- registry_name: str = "discord"
52
-
53
- # NOTE(crag): probably doesn't matter, but intentionally not defining tmp_download_file
54
- # __post_init__ for multiprocessing simplicity (no Path objects in initially
55
- # instantiated object)
56
- def _tmp_download_file(self):
57
- channel_file = self.channel + ".txt"
58
- return Path(self.read_config.download_dir) / channel_file
59
-
60
- @property
61
- def _output_filename(self):
62
- output_file = self.channel + ".json"
63
- return Path(self.processor_config.output_dir) / output_file
64
-
65
- def _create_full_tmp_dir_path(self):
66
- self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)
67
-
68
- @SourceConnectionNetworkError.wrap
69
- @requires_dependencies(dependencies=["discord"], extras="discord")
70
- def _get_messages(self):
71
- """Actually fetches the data from discord."""
72
- import discord
73
- from discord.ext import commands
74
-
75
- messages: t.List[discord.Message] = []
76
- jumpurl: t.List[str] = []
77
- intents = discord.Intents.default()
78
- intents.message_content = True
79
- bot = commands.Bot(command_prefix=">", intents=intents)
80
-
81
- @bot.event
82
- async def on_ready():
83
- try:
84
- after_date = None
85
- if self.days:
86
- after_date = dt.datetime.utcnow() - dt.timedelta(days=self.days)
87
- channel = bot.get_channel(int(self.channel))
88
- jumpurl.append(channel.jump_url) # type: ignore
89
- async for msg in channel.history(after=after_date): # type: ignore
90
- messages.append(msg)
91
- await bot.close()
92
- except Exception:
93
- logger.error("Error fetching messages")
94
- await bot.close()
95
- raise
96
-
97
- bot.run(self.connector_config.access_config.token)
98
- jump_url = None if len(jumpurl) < 1 else jumpurl[0]
99
- return messages, jump_url
100
-
101
- def update_source_metadata(self, **kwargs):
102
- messages, jump_url = kwargs.get("messages_tuple", self._get_messages())
103
- if messages == []:
104
- self.source_metadata = SourceMetadata(
105
- exists=False,
106
- )
107
- return
108
- dates = [m.created_at for m in messages if m.created_at]
109
- dates.sort()
110
- self.source_metadata = SourceMetadata(
111
- date_created=dates[0].isoformat(),
112
- date_modified=dates[-1].isoformat(),
113
- source_url=jump_url,
114
- exists=True,
115
- )
116
-
117
- @SourceConnectionError.wrap
118
- @BaseSingleIngestDoc.skip_if_file_exists
119
- def get_file(self):
120
- self._create_full_tmp_dir_path()
121
-
122
- messages, jump_url = self._get_messages()
123
- self.update_source_metadata(messages_tuple=(messages, jump_url))
124
- if messages == []:
125
- raise ValueError(f"Failed to retrieve messages from Discord channel {self.channel}")
126
- self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)
127
- with open(self._tmp_download_file(), "w") as f:
128
- for m in messages:
129
- f.write(m.content + "\n")
130
-
131
- @property
132
- def filename(self):
133
- """The filename of the file created from a discord channel"""
134
- return self._tmp_download_file()
135
-
136
- @property
137
- def version(self) -> t.Optional[str]:
138
- return None
139
-
140
- @property
141
- def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
142
- return {
143
- "channel": self.channel,
144
- }
145
-
146
-
147
- class DiscordSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
148
- """Objects of this class support fetching document(s) from"""
149
-
150
- connector_config: SimpleDiscordConfig
151
-
152
- def initialize(self):
153
- pass
154
-
155
- @requires_dependencies(dependencies=["discord"], extras="discord")
156
- def check_connection(self):
157
- import asyncio
158
-
159
- import discord
160
- from discord.client import Client
161
-
162
- intents = discord.Intents.default()
163
- try:
164
- client = Client(intents=intents)
165
- asyncio.run(client.start(token=self.connector_config.access_config.token))
166
- except Exception as e:
167
- logger.error(f"failed to validate connection: {e}", exc_info=True)
168
- raise SourceConnectionError(f"failed to validate connection: {e}")
169
-
170
- def get_ingest_docs(self):
171
- return [
172
- DiscordIngestDoc(
173
- connector_config=self.connector_config,
174
- processor_config=self.processor_config,
175
- read_config=self.read_config,
176
- channel=channel,
177
- days=self.connector_config.period,
178
- )
179
- for channel in self.connector_config.channels
180
- ]
@@ -1,396 +0,0 @@
1
- import copy
2
- import hashlib
3
- import typing as t
4
- import uuid
5
- from dataclasses import dataclass, field
6
- from pathlib import Path
7
-
8
- from dataclasses_json.core import Json
9
-
10
- from unstructured_ingest.enhanced_dataclass import enhanced_field
11
- from unstructured_ingest.enhanced_dataclass.core import _asdict
12
- from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
13
- from unstructured_ingest.interfaces import (
14
- AccessConfig,
15
- BaseConnectorConfig,
16
- BaseDestinationConnector,
17
- BaseIngestDocBatch,
18
- BaseSingleIngestDoc,
19
- BaseSourceConnector,
20
- IngestDocCleanupMixin,
21
- SourceConnectorCleanupMixin,
22
- SourceMetadata,
23
- WriteConfig,
24
- )
25
- from unstructured_ingest.logger import logger
26
- from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
27
- from unstructured_ingest.utils.dep_check import requires_dependencies
28
-
29
- if t.TYPE_CHECKING:
30
- from elasticsearch import Elasticsearch
31
-
32
-
33
- @dataclass
34
- class ElasticsearchAccessConfig(AccessConfig):
35
- hosts: t.Optional[t.List[str]] = None
36
- username: t.Optional[str] = None
37
- password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
38
- cloud_id: t.Optional[str] = None
39
- api_key: t.Optional[str] = enhanced_field(
40
- default=None, sensitive=True, overload_name="es_api_key"
41
- )
42
- api_key_id: t.Optional[str] = None
43
- bearer_auth: t.Optional[str] = enhanced_field(default=None, sensitive=True)
44
- ca_certs: t.Optional[str] = None
45
- ssl_assert_fingerprint: t.Optional[str] = enhanced_field(default=None, sensitive=True)
46
-
47
- def to_dict(self, **kwargs) -> t.Dict[str, Json]:
48
- d = super().to_dict(**kwargs)
49
- # Update auth related fields to conform to what the SDK expects based on the
50
- # supported methods:
51
- # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
52
- if not self.ca_certs:
53
- # ES library already sets a default for this, don't want to
54
- # introduce data by setting it to None
55
- d.pop("ca_certs")
56
- if self.password and (self.cloud_id or self.ca_certs or self.ssl_assert_fingerprint):
57
- d.pop("password")
58
- d["basic_auth"] = ("elastic", self.password)
59
- elif not self.cloud_id and self.username and self.password:
60
- d.pop("username", None)
61
- d.pop("password", None)
62
- d["basic_auth"] = (self.username, self.password)
63
- elif self.api_key and self.api_key_id:
64
- d.pop("api_key_id", None)
65
- d.pop("api_key", None)
66
- d["api_key"] = (self.api_key_id, self.api_key)
67
- # This doesn't exist on the client init, remove:
68
- d.pop("api_key_id", None)
69
- return d
70
-
71
-
72
- @dataclass
73
- class SimpleElasticsearchConfig(BaseConnectorConfig):
74
- """Connector config where:
75
- url is the url to access the elasticsearch server,
76
- index_name is the name of the index to reach to,
77
- """
78
-
79
- index_name: str
80
- batch_size: int = 100
81
- fields: t.List[str] = field(default_factory=list)
82
- access_config: ElasticsearchAccessConfig = None
83
-
84
-
85
- @dataclass
86
- class ElasticsearchDocumentMeta:
87
- """Metadata specifying:
88
- name of the elasticsearch index that is being reached to,
89
- and the id of document that is being reached to,
90
- """
91
-
92
- index_name: str
93
- document_id: str
94
-
95
-
96
- @dataclass
97
- class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
98
- """Class encapsulating fetching a doc and writing processed results (but not
99
- doing the processing!).
100
-
101
- Current implementation creates a python Elasticsearch client to fetch each doc,
102
- rather than creating a client for each thread.
103
- """
104
-
105
- connector_config: SimpleElasticsearchConfig
106
- document_meta: ElasticsearchDocumentMeta
107
- document: dict = field(default_factory=dict)
108
- registry_name: str = "elasticsearch"
109
-
110
- # TODO: remove one of filename or _tmp_download_file, using a wrapper
111
- @property
112
- def filename(self):
113
- f = self.document_meta.document_id
114
- if self.connector_config.fields:
115
- f = "{}-{}".format(
116
- f,
117
- hashlib.sha256(",".join(self.connector_config.fields).encode()).hexdigest()[:8],
118
- )
119
- return (
120
- Path(self.read_config.download_dir) / self.document_meta.index_name / f"{f}.txt"
121
- ).resolve()
122
-
123
- @property
124
- def _output_filename(self):
125
- """Create filename document id combined with a hash of the query to uniquely identify
126
- the output file."""
127
- # Generate SHA256 hash and take the first 8 characters
128
- filename = self.document_meta.document_id
129
- if self.connector_config.fields:
130
- filename = "{}-{}".format(
131
- filename,
132
- hashlib.sha256(",".join(self.connector_config.fields).encode()).hexdigest()[:8],
133
- )
134
- output_file = f"{filename}.json"
135
- return (
136
- Path(self.processor_config.output_dir) / self.connector_config.index_name / output_file
137
- )
138
-
139
- def update_source_metadata(self, **kwargs):
140
- if self.document is None:
141
- self.source_metadata = SourceMetadata(
142
- exists=False,
143
- )
144
- return
145
- self.source_metadata = SourceMetadata(
146
- version=self.document["_version"],
147
- exists=True,
148
- )
149
-
150
- @SourceConnectionError.wrap
151
- @requires_dependencies(["elasticsearch"], extras="elasticsearch")
152
- @BaseSingleIngestDoc.skip_if_file_exists
153
- def get_file(self):
154
- pass
155
-
156
- @property
157
- def date_created(self) -> t.Optional[str]:
158
- return None
159
-
160
- @property
161
- def date_modified(self) -> t.Optional[str]:
162
- return None
163
-
164
- @property
165
- def source_url(self) -> t.Optional[str]:
166
- return None
167
-
168
- @property
169
- def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
170
- return {
171
- "hosts": self.connector_config.access_config.hosts,
172
- "index_name": self.connector_config.index_name,
173
- "document_id": self.document_meta.document_id,
174
- }
175
-
176
-
177
- @dataclass
178
- class ElasticsearchIngestDocBatch(BaseIngestDocBatch):
179
- connector_config: SimpleElasticsearchConfig
180
- ingest_docs: t.List[ElasticsearchIngestDoc] = field(default_factory=list)
181
- list_of_ids: t.List[str] = field(default_factory=list)
182
- registry_name: str = "elasticsearch_batch"
183
-
184
- def __post_init__(self):
185
- # Until python3.8 is deprecated, this is a limitation of dataclass inheritance
186
- # to make it a required field
187
- if len(self.list_of_ids) == 0:
188
- raise ValueError("list_of_ids is required")
189
-
190
- @property
191
- def unique_id(self) -> str:
192
- return ",".join(sorted(self.list_of_ids))
193
-
194
- @requires_dependencies(["elasticsearch"], extras="elasticsearch")
195
- def _get_docs(self):
196
- from elasticsearch import Elasticsearch
197
- from elasticsearch.helpers import scan
198
-
199
- es = Elasticsearch(**self.connector_config.access_config.to_dict(apply_name_overload=False))
200
- scan_query = {
201
- "_source": self.connector_config.fields,
202
- "version": True,
203
- "query": {"ids": {"values": self.list_of_ids}},
204
- }
205
-
206
- result = scan(
207
- es,
208
- query=scan_query,
209
- scroll="1m",
210
- index=self.connector_config.index_name,
211
- )
212
- return list(result)
213
-
214
- @SourceConnectionError.wrap
215
- @requires_dependencies(["elasticsearch"], extras="elasticsearch")
216
- def get_files(self):
217
- documents = self._get_docs()
218
- for doc in documents:
219
- ingest_doc = ElasticsearchIngestDoc(
220
- processor_config=self.processor_config,
221
- read_config=self.read_config,
222
- connector_config=self.connector_config,
223
- document=doc,
224
- document_meta=ElasticsearchDocumentMeta(
225
- self.connector_config.index_name, doc["_id"]
226
- ),
227
- )
228
- ingest_doc.update_source_metadata()
229
- doc_body = doc["_source"]
230
- filename = ingest_doc.filename
231
- flattened_dict = flatten_dict(dictionary=doc_body)
232
- str_values = [str(value) for value in flattened_dict.values()]
233
- concatenated_values = "\n".join(str_values)
234
-
235
- filename.parent.mkdir(parents=True, exist_ok=True)
236
- with open(filename, "w", encoding="utf8") as f:
237
- f.write(concatenated_values)
238
- self.ingest_docs.append(ingest_doc)
239
-
240
-
241
- @dataclass
242
- class ElasticsearchSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
243
- """Fetches particular fields from all documents in a given elasticsearch cluster and index"""
244
-
245
- connector_config: SimpleElasticsearchConfig
246
- _es: t.Optional["Elasticsearch"] = field(init=False, default=None)
247
-
248
- @property
249
- def es(self):
250
- from elasticsearch import Elasticsearch
251
-
252
- if self._es is None:
253
- self._es = Elasticsearch(
254
- **self.connector_config.access_config.to_dict(apply_name_overload=False)
255
- )
256
- return self._es
257
-
258
- def check_connection(self):
259
- try:
260
- self.es.perform_request("HEAD", "/", headers={"accept": "application/json"})
261
- except Exception as e:
262
- logger.error(f"failed to validate connection: {e}", exc_info=True)
263
- raise SourceConnectionError(f"failed to validate connection: {e}")
264
-
265
- def __post_init__(self):
266
- self.scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
267
-
268
- def initialize(self):
269
- pass
270
-
271
- @requires_dependencies(["elasticsearch"], extras="elasticsearch")
272
- def _get_doc_ids(self):
273
- """Fetches all document ids in an index"""
274
- from elasticsearch.helpers import scan
275
-
276
- hits = scan(
277
- self.es,
278
- query=self.scan_query,
279
- scroll="1m",
280
- index=self.connector_config.index_name,
281
- )
282
-
283
- return [hit["_id"] for hit in hits]
284
-
285
- def get_ingest_docs(self):
286
- """Fetches all documents in an index, using ids that are fetched with _get_doc_ids"""
287
- ids = self._get_doc_ids()
288
- id_batches = [
289
- ids[
290
- i
291
- * self.connector_config.batch_size : (i + 1) # noqa
292
- * self.connector_config.batch_size
293
- ]
294
- for i in range(
295
- (len(ids) + self.connector_config.batch_size - 1)
296
- // self.connector_config.batch_size
297
- )
298
- ]
299
- return [
300
- ElasticsearchIngestDocBatch(
301
- connector_config=self.connector_config,
302
- processor_config=self.processor_config,
303
- read_config=self.read_config,
304
- list_of_ids=batched_ids,
305
- )
306
- for batched_ids in id_batches
307
- ]
308
-
309
-
310
- @dataclass
311
- class ElasticsearchWriteConfig(WriteConfig):
312
- batch_size_bytes: int = 15_000_000
313
- num_processes: int = 1
314
-
315
-
316
- @dataclass
317
- class ElasticsearchDestinationConnector(BaseDestinationConnector):
318
- write_config: ElasticsearchWriteConfig
319
- connector_config: SimpleElasticsearchConfig
320
- _client: t.Optional["Elasticsearch"] = field(init=False, default=None)
321
-
322
- def to_dict(self, **kwargs):
323
- """
324
- The _client variable in this dataclass breaks deepcopy due to:
325
- TypeError: cannot pickle '_thread.lock' object
326
- When serializing, remove it, meaning client data will need to be reinitialized
327
- when deserialized
328
- """
329
- self_cp = copy.copy(self)
330
- if hasattr(self_cp, "_client"):
331
- setattr(self_cp, "_client", None)
332
- return _asdict(self_cp, **kwargs)
333
-
334
- @DestinationConnectionError.wrap
335
- @requires_dependencies(["elasticsearch"], extras="elasticsearch")
336
- def generate_client(self) -> "Elasticsearch":
337
- from elasticsearch import Elasticsearch
338
-
339
- return Elasticsearch(
340
- **self.connector_config.access_config.to_dict(apply_name_overload=False)
341
- )
342
-
343
- @property
344
- def client(self):
345
- if self._client is None:
346
- self._client = self.generate_client()
347
- return self._client
348
-
349
- def initialize(self):
350
- _ = self.client
351
-
352
- @DestinationConnectionError.wrap
353
- def check_connection(self):
354
- try:
355
- assert self.client.ping()
356
- except Exception as e:
357
- logger.error(f"failed to validate connection: {e}", exc_info=True)
358
- raise DestinationConnectionError(f"failed to validate connection: {e}")
359
-
360
- @requires_dependencies(["elasticsearch"], extras="elasticsearch")
361
- def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
362
- logger.info(
363
- f"writing document batches to destination"
364
- f" index named {self.connector_config.index_name}"
365
- f" at {self.connector_config.access_config.hosts}"
366
- f" with batch size (in bytes) {self.write_config.batch_size_bytes}"
367
- f" with {self.write_config.num_processes} (number of) processes"
368
- )
369
- from elasticsearch.helpers import parallel_bulk
370
-
371
- for batch in generator_batching_wbytes(
372
- elements_dict, batch_size_limit_bytes=self.write_config.batch_size_bytes
373
- ):
374
- for success, info in parallel_bulk(
375
- self.client, batch, thread_count=self.write_config.num_processes
376
- ):
377
- if not success:
378
- logger.error(
379
- "upload failed for a batch in elasticsearch destination connector:", info
380
- )
381
-
382
- def normalize_dict(self, element_dict: dict) -> dict:
383
- return {
384
- "_index": self.connector_config.index_name,
385
- "_id": str(uuid.uuid4()),
386
- "_source": {
387
- "element_id": element_dict.pop("element_id", None),
388
- "embeddings": element_dict.pop("embeddings", None),
389
- "text": element_dict.pop("text", None),
390
- "type": element_dict.pop("type", None),
391
- "metadata": flatten_dict(
392
- element_dict.pop("metadata", None),
393
- separator="-",
394
- ),
395
- },
396
- }
@@ -1,78 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.connector.fsspec.fsspec import (
5
- FsspecDestinationConnector,
6
- FsspecIngestDoc,
7
- FsspecSourceConnector,
8
- FsspecWriteConfig,
9
- SimpleFsspecConfig,
10
- WriteTextConfig,
11
- )
12
- from unstructured_ingest.enhanced_dataclass import enhanced_field
13
- from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
14
- from unstructured_ingest.interfaces import AccessConfig
15
- from unstructured_ingest.logger import logger
16
- from unstructured_ingest.utils.dep_check import requires_dependencies
17
-
18
-
19
- @dataclass
20
- class AzureWriteTextConfig(WriteTextConfig):
21
- overwrite: bool = False
22
-
23
-
24
- @dataclass
25
- class AzureWriteConfig(FsspecWriteConfig):
26
- write_text_config: t.Optional[AzureWriteTextConfig] = None
27
-
28
-
29
- @dataclass
30
- class AzureAccessConfig(AccessConfig):
31
- account_name: t.Optional[str] = enhanced_field(default=None, sensitive=True)
32
- account_key: t.Optional[str] = enhanced_field(default=None, sensitive=True)
33
- connection_string: t.Optional[str] = enhanced_field(default=None, sensitive=True)
34
- sas_token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
35
-
36
-
37
- @dataclass
38
- class SimpleAzureBlobStorageConfig(SimpleFsspecConfig):
39
- access_config: AzureAccessConfig = None
40
-
41
-
42
- @dataclass
43
- class AzureBlobStorageIngestDoc(FsspecIngestDoc):
44
- connector_config: SimpleAzureBlobStorageConfig
45
- registry_name: str = "azure"
46
-
47
- @SourceConnectionError.wrap
48
- @requires_dependencies(["adlfs", "fsspec"], extras="azure")
49
- def get_file(self):
50
- super().get_file()
51
-
52
-
53
- @dataclass
54
- class AzureBlobStorageSourceConnector(FsspecSourceConnector):
55
- connector_config: SimpleAzureBlobStorageConfig
56
-
57
- def __post_init__(self):
58
- self.ingest_doc_cls: t.Type[AzureBlobStorageIngestDoc] = AzureBlobStorageIngestDoc
59
-
60
-
61
- @dataclass
62
- class AzureBlobStorageDestinationConnector(FsspecDestinationConnector):
63
- connector_config: SimpleAzureBlobStorageConfig
64
- write_config: AzureWriteConfig
65
-
66
- @requires_dependencies(["adlfs", "fsspec"], extras="azure")
67
- def initialize(self):
68
- super().initialize()
69
-
70
- @requires_dependencies(["adlfs"], extras="azure")
71
- def check_connection(self):
72
- from adlfs import AzureBlobFileSystem
73
-
74
- try:
75
- AzureBlobFileSystem(**self.connector_config.get_access_config())
76
- except ValueError as connection_error:
77
- logger.error(f"failed to validate connection: {connection_error}", exc_info=True)
78
- raise DestinationConnectionError(f"failed to validate connection: {connection_error}")