unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +55 -27
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +8 -8
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/METADATA +98 -97
  279. unstructured_ingest-0.7.1.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/entry_points.txt +0 -0
@@ -1,224 +0,0 @@
1
- import typing as t
2
- import xml.etree.ElementTree as ET
3
- from dataclasses import dataclass
4
- from datetime import datetime
5
- from pathlib import Path
6
-
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
8
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
9
- from unstructured_ingest.interfaces import (
10
- AccessConfig,
11
- BaseConnectorConfig,
12
- BaseSingleIngestDoc,
13
- BaseSourceConnector,
14
- IngestDocCleanupMixin,
15
- SourceConnectorCleanupMixin,
16
- SourceMetadata,
17
- )
18
- from unstructured_ingest.logger import logger
19
- from unstructured_ingest.utils.data_prep import validate_date_args
20
- from unstructured_ingest.utils.dep_check import (
21
- requires_dependencies,
22
- )
23
-
24
- DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
25
-
26
-
27
@dataclass
class SlackAccessConfig(AccessConfig):
    """Credentials handed to the Slack client by the Slack connector."""

    # Slack token; declared through enhanced_field with sensitive=True.
    token: str = enhanced_field(sensitive=True)
30
-
31
-
32
@dataclass
class SimpleSlackConfig(BaseConnectorConfig):
    """Connector config to process all messages by channel ids.

    ``start_date`` and ``end_date`` are optional bounds on the message history;
    when given they are checked with ``validate_date_args`` at construction time.
    """

    access_config: SlackAccessConfig
    channels: t.List[str]
    start_date: t.Optional[str] = None
    end_date: t.Optional[str] = None

    def validate_inputs(self):
        """Validate the optional date bounds.

        Returns:
            A ``(oldest_valid, latest_valid)`` pair. A bound that is not set
            counts as valid; otherwise the value is whatever
            ``validate_date_args`` returns for it.
        """
        oldest_valid = validate_date_args(self.start_date) if self.start_date else True
        latest_valid = validate_date_args(self.end_date) if self.end_date else True
        return oldest_valid, latest_valid

    def __post_init__(self):
        """Reject the config as soon as either date bound is invalid.

        Raises:
            ValueError: if ``start_date`` and/or ``end_date`` fails validation.
        """
        oldest_valid, latest_valid = self.validate_inputs()
        # The previous check used `and`, so a config with exactly one bad bound
        # slipped through even though the message says "and/or".
        if not (oldest_valid and latest_valid):
            raise ValueError(
                "Start and/or End dates are not valid. ",
            )
59
-
60
-
61
@dataclass
class SlackIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating fetching a doc and writing processed results (but not
    doing the processing!).

    One instance covers one Slack channel: its messages (and their replies) are
    downloaded and stored locally as a single XML file.

    Also includes a cleanup method. When things go wrong and the cleanup
    method is not called, the file is left behind on the filesystem to assist debugging.
    """

    connector_config: SimpleSlackConfig
    channel: str
    registry_name: str = "slack"

    # NOTE(crag): probably doesn't matter, but intentionally not defining tmp_download_file
    # __post_init__ for multiprocessing simplicity (no Path objects in initially
    # instantiated object)
    def _tmp_download_file(self):
        """Return the path of the XML file this channel is downloaded to."""
        channel_file = self.channel + ".xml"
        return Path(self.read_config.download_dir) / channel_file

    @property
    def _output_filename(self):
        """Path of the JSON output file for this channel."""
        output_file = self.channel + ".json"
        return Path(self.processor_config.output_dir) / output_file

    @property
    def version(self) -> t.Optional[str]:
        """No version is tracked for a Slack channel; always ``None``."""
        return None

    @property
    def source_url(self) -> t.Optional[str]:
        """No source URL is tracked for a Slack channel; always ``None``."""
        return None

    def _create_full_tmp_dir_path(self):
        """Create the download directory (and parents) if it does not exist yet."""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(dependencies=["slack_sdk"], extras="slack")
    def _fetch_messages(self):
        """Fetch the channel history, bounded by the configured dates.

        Also stores the Slack ``WebClient`` on ``self.client``; ``get_file``
        reuses it to fetch replies.

        Returns:
            The response of ``conversations_history`` for ``self.channel``.
        """
        from slack_sdk import WebClient

        self.client = WebClient(token=self.connector_config.access_config.token)
        # NOTE(review): "0" is sent for both bounds when no date is configured;
        # confirm against the Slack API that latest="0" means "no upper bound".
        oldest = "0"
        latest = "0"
        if self.connector_config.start_date:
            oldest = self.convert_datetime(self.connector_config.start_date)

        if self.connector_config.end_date:
            latest = self.convert_datetime(self.connector_config.end_date)

        return self.client.conversations_history(
            channel=self.channel,
            oldest=oldest,
            latest=latest,
        )

    def update_source_metadata(self, **kwargs):
        """Populate ``self.source_metadata`` from the channel's message timestamps.

        Args:
            **kwargs: may carry ``result``, an already fetched history response.
                The history is fetched here only when ``result`` is not passed.
        """
        # `kwargs.get("result", self._fetch_messages())` evaluated the fetch
        # eagerly, issuing a second API call even when a result was supplied.
        result = kwargs["result"] if "result" in kwargs else self._fetch_messages()
        if result is None:
            self.source_metadata = SourceMetadata(
                exists=True,
            )
            return

        # Compare numerically rather than as strings.
        timestamps = sorted(float(m["ts"]) for m in result["messages"])
        date_created = None
        date_modified = None
        if timestamps:
            date_created = datetime.fromtimestamp(timestamps[0]).isoformat()
            date_modified = datetime.fromtimestamp(timestamps[-1]).isoformat()

        self.source_metadata = SourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            exists=True,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    @requires_dependencies(dependencies=["slack_sdk"], extras="slack")
    def get_file(self):
        """Fetches the data from a slack channel and stores it locally.

        Each message becomes a ``<message><text>`` element; the text of every
        entry returned by ``conversations_replies`` is appended to the message
        text, separated by `` <reply> ``. The tree is written to the temporary
        download file as UTF-8 XML.
        """
        from slack_sdk.errors import SlackApiError

        self._create_full_tmp_dir_path()

        result = self._fetch_messages()
        self.update_source_metadata(result=result)
        root = ET.Element("messages")
        for message in result["messages"]:
            message_elem = ET.SubElement(root, "message")
            text_elem = ET.SubElement(message_elem, "text")
            text_elem.text = message.get("text")

            cursor = None
            while True:
                try:
                    response = self.client.conversations_replies(
                        channel=self.channel,
                        ts=message["ts"],
                        cursor=cursor,
                    )
                except SlackApiError as e:
                    logger.error(f"Error retrieving replies: {e.response['error']}")
                    # Give up on this thread; without the break the same
                    # failing request was retried forever.
                    break

                for reply in response["messages"]:
                    # A reply without text would otherwise make join() raise.
                    reply_msg = reply.get("text") or ""
                    text_elem.text = "".join([str(text_elem.text), " <reply> ", reply_msg])

                if not response["has_more"]:
                    break

                cursor = response["response_metadata"]["next_cursor"]

        tree = ET.ElementTree(root)
        tree.write(self._tmp_download_file(), encoding="utf-8", xml_declaration=True)

    def convert_datetime(self, date_time):
        """Convert a date string to a POSIX timestamp.

        Args:
            date_time: date string in one of ``DATE_FORMATS``.

        Returns:
            The timestamp as a float.

        Raises:
            ValueError: if the string matches none of ``DATE_FORMATS``
                (previously ``None`` was returned silently).
        """
        for date_format in DATE_FORMATS:
            try:
                return datetime.strptime(date_time, date_format).timestamp()
            except ValueError:
                continue
        raise ValueError(
            f"The date {date_time!r} does not match any of the supported formats: {DATE_FORMATS}"
        )

    @property
    def filename(self):
        """The filename of the file created from a slack channel"""
        return self._tmp_download_file()
193
-
194
-
195
class SlackSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector producing one ``SlackIngestDoc`` per configured channel."""

    connector_config: SimpleSlackConfig

    @requires_dependencies(dependencies=["slack_sdk"], extras="slack")
    def check_connection(self):
        """Call ``users_identity`` with the configured token to validate it.

        Raises:
            SourceConnectionError: if the Slack client reports an error.
        """
        from slack_sdk import WebClient
        from slack_sdk.errors import SlackClientError

        access_token = self.connector_config.access_config.token
        try:
            WebClient(token=access_token).users_identity()
        except SlackClientError as slack_error:
            logger.error(f"failed to validate connection: {slack_error}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {slack_error}")

    def initialize(self):
        """No-op: this connector needs no setup before ingest docs are listed."""

    def get_ingest_docs(self):
        """Build the ingest docs, one per entry in ``connector_config.channels``."""
        ingest_docs = []
        for channel_id in self.connector_config.channels:
            ingest_docs.append(
                SlackIngestDoc(
                    connector_config=self.connector_config,
                    processor_config=self.processor_config,
                    read_config=self.read_config,
                    channel=channel_id,
                )
            )
        return ingest_docs
@@ -1,199 +0,0 @@
1
- import copy
2
- import json
3
- import typing as t
4
- import uuid
5
- from dataclasses import dataclass, field
6
-
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
8
- from unstructured_ingest.enhanced_dataclass.core import _asdict
9
- from unstructured_ingest.error import DestinationConnectionError
10
- from unstructured_ingest.interfaces import (
11
- AccessConfig,
12
- BaseConnectorConfig,
13
- BaseDestinationConnector,
14
- )
15
- from unstructured_ingest.logger import logger
16
- from unstructured_ingest.utils.dep_check import requires_dependencies
17
-
18
- ELEMENTS_TABLE_NAME = "elements"
19
-
20
-
21
@dataclass
class SqlAccessConfig(AccessConfig):
    """Login credentials for the SQL destination."""

    # Database user name; may be None.
    username: t.Optional[str]
    # Database password; declared through enhanced_field with sensitive=True.
    password: t.Optional[str] = enhanced_field(sensitive=True)
25
-
26
-
27
@dataclass
class SimpleSqlConfig(BaseConnectorConfig):
    """Connection settings for the SQL destination (``postgresql`` or ``sqlite``)."""

    db_type: t.Optional[str]
    host: t.Optional[str]
    database: t.Optional[str]
    port: t.Optional[int]
    access_config: SqlAccessConfig

    def __post_init__(self):
        """Fail early when a sqlite config has no database path."""
        sqlite_without_path = self.db_type == "sqlite" and self.database is None
        if sqlite_without_path:
            message = (
                "A sqlite connection requires a path to a *.db file "
                "through the `database` argument"
            )
            raise ValueError(message)

    @property
    def connection(self):
        """Return the connection factory (a callable) matching ``db_type``.

        Raises:
            ValueError: for any ``db_type`` other than ``postgresql``/``sqlite``.
        """
        if self.db_type == "sqlite":
            return self._make_sqlite_connection
        if self.db_type == "postgresql":
            return self._make_psycopg_connection
        raise ValueError(f"Unsupported database {self.db_type} connection.")

    def _make_sqlite_connection(self):
        """Open a sqlite3 connection to ``self.database``."""
        import sqlite3

        return sqlite3.connect(database=self.database)

    @requires_dependencies(["psycopg2"], extras="postgres")
    def _make_psycopg_connection(self):
        """Open a psycopg2 connection from the configured host and credentials."""
        from psycopg2 import connect

        credentials = self.access_config
        connect_kwargs = {
            "user": credentials.username,
            "password": credentials.password,
            "dbname": self.database,
            "host": self.host,
            "port": self.port,
        }
        return connect(**connect_kwargs)
66
-
67
-
68
@dataclass
class SqlDestinationConnector(BaseDestinationConnector):
    """Destination connector inserting element dicts into the ``elements`` SQL table."""

    connector_config: SimpleSqlConfig
    # Lazily created DB connection; see the `client` property.
    _client: t.Optional[t.Any] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """
        The _client variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        self_cp = copy.copy(self)
        self_cp._client = None
        return _asdict(self_cp, **kwargs)

    @property
    def client(self):
        """Return the cached connection, opening a new one when none is cached."""
        if self._client is None:
            self._client = self.connector_config.connection()
        return self._client

    @DestinationConnectionError.wrap
    def initialize(self):
        """Open the connection eagerly so connection problems surface here."""
        _ = self.client

    def check_connection(self):
        """Run ``SELECT 1;`` to validate the connection.

        Raises:
            DestinationConnectionError: if connecting or the query fails.
        """
        try:
            cursor = self.client.cursor()
            try:
                cursor.execute("SELECT 1;")
            finally:
                # Close the cursor even when the probe query fails.
                cursor.close()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def conform_dict(self, data: dict) -> None:
        """
        Updates the element dictionary to conform to the sql schema

        Works in place: assigns a fresh ``id``, JSON-encodes nested values,
        parses ISO date strings into ``datetime`` objects, casts ``version`` and
        ``page_number`` to ``str``, and finally flattens ``data_source``,
        ``coordinates`` and ``metadata`` into the top-level dict.
        """
        from datetime import datetime

        data["id"] = str(uuid.uuid4())

        # `or {}` also covers keys that are present but None, which used to
        # raise AttributeError on the chained .get() calls. When a section is
        # present these names alias the dicts inside `data`, so the writes
        # below mutate `data` itself.
        metadata = data.get("metadata") or {}
        data_source = metadata.get("data_source") or {}
        coordinates = metadata.get("coordinates") or {}

        # Dict as string formatting
        # (explicit JSON strings, otherwise schema type checking fails)
        for key in ("record_locator", "permissions_data"):
            if value := data_source.get(key):
                data_source[key] = json.dumps(value)

        # Array of items as string formatting
        if (embeddings := data.get("embeddings")) and (
            self.connector_config.db_type != "postgresql"
        ):
            data["embeddings"] = json.dumps(embeddings)

        if points := coordinates.get("points"):
            coordinates["points"] = json.dumps(points)

        for key in ("links", "sent_from", "sent_to", "regex_metadata"):
            if value := metadata.get(key):
                metadata[key] = json.dumps(value)

        # Datetime formatting
        for key in ("date_created", "date_modified", "date_processed"):
            if value := data_source.get(key):
                data_source[key] = datetime.fromisoformat(value)

        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = datetime.fromisoformat(last_modified)

        # String casting
        if version := data_source.get("version"):
            data_source["version"] = str(version)

        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)

        # Flatten the nested sections into the top-level dict, innermost first.
        if data_source:
            data.update(metadata.pop("data_source"))
        if coordinates:
            data.update(metadata.pop("coordinates"))
        if metadata:
            data.update(data.pop("metadata"))

    @DestinationConnectionError.wrap
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Insert every element of ``elements_dict`` as one row.

        The connection is closed afterwards and dropped from the cache, so the
        next access to ``client`` opens a fresh one.
        """
        logger.info(
            f"writing {len(elements_dict)} objects to database {self.connector_config.database} "
            f"at {self.connector_config.host}"
        )

        is_sqlite = self.connector_config.db_type == "sqlite"
        placeholder = "?" if is_sqlite else "%s"

        conn = self.client
        try:
            with conn:
                cursor = conn.cursor()
                try:
                    # Since we have no guarantee that each element will have the same keys
                    # we insert each element individually
                    for elem in elements_dict:
                        # NOTE(review): column names come straight from the element keys
                        # and are interpolated into the SQL text; only the values are
                        # bound as parameters. Keys must come from a trusted source.
                        columns = ",".join(elem.keys())
                        placeholders = ",".join(placeholder for _ in elem)
                        query = (
                            f"INSERT INTO {ELEMENTS_TABLE_NAME} ({columns}) "
                            f"VALUES({placeholders})"
                        )
                        values = [
                            json.dumps(v) if is_sqlite and isinstance(v, list) else v
                            for v in elem.values()
                        ]
                        cursor.execute(query, values)

                    conn.commit()
                finally:
                    cursor.close()
        finally:
            # Leaving contexts doesn't close the connection, so doing it here.
            conn.close()
            # Drop the closed handle; it used to stay cached and broke any
            # later check_connection()/write_dict() call.
            self._client = None
@@ -1,253 +0,0 @@
1
- import datetime
2
- import json
3
- import typing as t
4
- import uuid
5
- from dataclasses import dataclass, field
6
-
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
8
- from unstructured_ingest.error import DestinationConnectionError
9
- from unstructured_ingest.interfaces import (
10
- AccessConfig,
11
- BaseConnectorConfig,
12
- BaseDestinationConnector,
13
- BaseIngestDoc,
14
- WriteConfig,
15
- )
16
- from unstructured_ingest.logger import logger
17
- from unstructured_ingest.utils.data_prep import flatten_dict
18
- from unstructured_ingest.utils.dep_check import requires_dependencies
19
-
20
- BASE_URL = "https://api.vectara.io/v1"
21
-
22
-
23
@dataclass
class VectaraAccessConfig(AccessConfig):
    """OAuth client credentials used to request a Vectara JWT token."""

    # Both values are declared through enhanced_field with sensitive=True.
    oauth_client_id: str = enhanced_field(sensitive=True)
    oauth_secret: str = enhanced_field(sensitive=True)
27
-
28
-
29
@dataclass
class SimpleVectaraConfig(BaseConnectorConfig):
    """Settings identifying the Vectara customer and target corpus."""

    access_config: VectaraAccessConfig
    customer_id: str
    # Name of the corpus to write to.
    corpus_name: t.Optional[str] = None
    # Id of the corpus to write to.
    corpus_id: t.Optional[str] = None
    # OAuth token endpoint template; `{}` is filled with customer_id.
    token_url: str = "https://vectara-prod-{}.auth.us-west-2.amazoncognito.com/oauth2/token"
36
-
37
-
38
@dataclass
class VectaraDestinationConnector(BaseDestinationConnector):
    """Destination connector that indexes partitioned documents into a Vectara corpus."""

    write_config: WriteConfig
    connector_config: SimpleVectaraConfig
    # Cached OAuth token and the POSIX timestamp at which it expires.
    _jwt_token: t.Optional[str] = field(init=False, default=None)
    _jwt_token_expires_ts: t.Optional[float] = field(init=False, default=None)

    @property
    def jwt_token(self):
        """Return a JWT token, requesting a new one when none is cached or the
        cached one expires within the next 60 seconds."""
        if (
            not self._jwt_token
            or self._jwt_token_expires_ts is None
            or self._jwt_token_expires_ts - datetime.datetime.now().timestamp() <= 60
        ):
            self._jwt_token = self._get_jwt_token()
        return self._jwt_token

    @DestinationConnectionError.wrap
    def vectara(self):
        """
        Check the connection for Vectara and validate corpus exists.
        - If more than one corpus with the same name exists - then return a message
        - If exactly one corpus exists with this name - use it.
        - If does not exist - create it.

        Raises:
            DestinationConnectionError: if any request fails.
        """
        try:
            # Get token if not already set
            _ = self.jwt_token

            list_corpora_response = self._request(
                endpoint="list-corpora",
                data={"numResults": 1, "filter": self.connector_config.corpus_name},
            )

            # `or []` guards against a response without a "corpus" entry.
            possible_corpora_ids_names_map = {
                corpus.get("id"): corpus.get("name")
                for corpus in list_corpora_response.get("corpus") or []
                if corpus.get("name") == self.connector_config.corpus_name
            }

            if len(possible_corpora_ids_names_map) > 1:
                return f"Multiple Corpora exist with name {self.connector_config.corpus_name}"
            if len(possible_corpora_ids_names_map) == 1:
                self.connector_config.corpus_id = next(iter(possible_corpora_ids_names_map))
            else:
                data = {
                    "corpus": {
                        "name": self.connector_config.corpus_name,
                    }
                }
                create_corpus_response = self._request(endpoint="create-corpus", data=data)
                self.connector_config.corpus_id = create_corpus_response.get("corpusId")

        except Exception as e:
            logger.error(f"failed to create Vectara connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to create Vectara connection: {e}") from e

    def initialize(self):
        """Resolve (or create) the target corpus before any write."""
        self.vectara()

    @requires_dependencies(["requests"], extras="vectara")
    def _request(
        self,
        endpoint: str,
        http_method: str = "POST",
        params: t.Optional[t.Mapping[str, t.Any]] = None,
        data: t.Optional[t.Mapping[str, t.Any]] = None,
    ):
        """Send an authenticated JSON request to ``BASE_URL/<endpoint>``.

        Args:
            endpoint: API endpoint appended to ``BASE_URL``.
            http_method: HTTP verb to use.
            params: optional query-string parameters.
            data: optional payload; sent JSON-encoded as the request body.

        Returns:
            The decoded JSON response.

        Raises:
            requests.HTTPError: on a non-success HTTP status.
        """
        import requests

        url = f"{BASE_URL}/{endpoint}"

        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {self.jwt_token}",
            "customer-id": self.connector_config.customer_id,
            "X-source": "unstructured",
        }

        response = requests.request(
            method=http_method, url=url, headers=headers, params=params, data=json.dumps(data)
        )
        response.raise_for_status()
        return response.json()

    # Get Oauth2 JWT token
    @requires_dependencies(["requests"], extras="vectara")
    def _get_jwt_token(self):
        """Connect to the server and get a JWT token.

        Also records the token's expiry time in ``_jwt_token_expires_ts``.
        """
        import requests

        token_endpoint = self.connector_config.token_url.format(self.connector_config.customer_id)
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
        }
        data = {
            "grant_type": "client_credentials",
            "client_id": self.connector_config.access_config.oauth_client_id,
            "client_secret": self.connector_config.access_config.oauth_secret,
        }

        response = requests.request(method="POST", url=token_endpoint, headers=headers, data=data)
        response.raise_for_status()
        response_json = response.json()

        request_time = datetime.datetime.now().timestamp()
        self._jwt_token_expires_ts = request_time + response_json.get("expires_in")

        return response_json.get("access_token")

    @DestinationConnectionError.wrap
    def check_connection(self):
        """Validate the connection by running the corpus lookup.

        Raises:
            DestinationConnectionError: if the lookup fails.
        """
        try:
            self.vectara()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def _delete_doc(self, doc_id: str) -> None:
        """
        Delete a document from the Vectara corpus.

        Args:
            doc_id (str): ID of the document to delete.
        """
        body = {
            "customer_id": self.connector_config.customer_id,
            "corpus_id": self.connector_config.corpus_id,
            "document_id": doc_id,
        }
        self._request(endpoint="delete-doc", data=body)

    def _index_document(self, document: t.Dict[str, t.Any]) -> None:
        """
        Index a document (by uploading it to the Vectara corpus) from the document dictionary

        A document that already exists is deleted and indexed again. Request
        failures are logged and swallowed so one bad document does not stop
        the rest of the batch.
        """
        body = {
            "customer_id": self.connector_config.customer_id,
            "corpus_id": self.connector_config.corpus_id,
            "document": document,
        }

        try:
            result = self._request(endpoint="index", data=body, http_method="POST")
        except Exception as e:
            logger.info(f"exception {e} while indexing document {document['documentId']}")
            return

        status = result["status"] if "status" in result else None

        if status and (
            "ALREADY_EXISTS" in status["code"]
            or (
                "CONFLICT" in status["code"]
                and "Indexing doesn't support updating documents" in status["statusDetail"]
            )
        ):
            logger.info(f"document {document['documentId']} already exists, re-indexing")
            self._delete_doc(document["documentId"])
            self._request(endpoint="index", data=body, http_method="POST")
            return

        if status and "OK" in status["code"]:
            logger.info(f"indexing document {document['documentId']} succeeded")
        else:
            logger.info(f"indexing document {document['documentId']} failed, response = {result}")

    def write_dict(self, *args, docs_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Index every prepared Vectara document in ``docs_list``."""
        logger.info(f"inserting / updating {len(docs_list)} documents to Vectara ")
        for vdoc in docs_list:
            self._index_document(vdoc)

    def write(self, docs: t.List[BaseIngestDoc]) -> None:
        """Turn each doc's JSON output file into one Vectara document and index it.

        Every element of the file becomes one section (its text plus selected
        metadata). Files that contain no elements are skipped with a warning.
        """
        docs_list: t.List[t.Dict[str, t.Any]] = []

        def get_metadata(element) -> t.Dict[str, t.Any]:
            """
            Select which meta-data fields to include and optionally map them to a new name.
            remove the "metadata-" prefix from the keys
            """
            metadata_map = {
                "page_number": "page_number",
                "data_source-url": "url",
                "filename": "filename",
                "filetype": "filetype",
                "last_modified": "last_modified",
            }
            md = flatten_dict(element, separator="-", flatten_lists=True)
            md = {k.replace("metadata-", ""): v for k, v in md.items()}
            md = {metadata_map[k]: v for k, v in md.items() if k in metadata_map}
            return md

        for doc in docs:
            local_path = doc._output_filename
            with open(local_path) as json_file:
                dict_content = json.load(json_file)

            if not dict_content:
                # Nothing to index; dict_content[0] below would raise IndexError.
                logger.warning(f"no elements found in {local_path}, skipping")
                continue

            vdoc = {
                "documentId": str(uuid.uuid4()),
                "title": dict_content[0].get("metadata", {}).get("data_source", {}).get("url"),
                "section": [
                    {
                        "text": element.pop("text", None),
                        "metadataJson": json.dumps(get_metadata(element)),
                    }
                    for element in dict_content
                ],
            }
            # Report the number of sections; len(vdoc) is just the key count (3).
            logger.info(
                f"Extending {len(vdoc['section'])} json elements from content in {local_path}",
            )
            docs_list.append(vdoc)
        self.write_dict(docs_list=docs_list)