unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +55 -27
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +8 -8
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/METADATA +98 -97
  279. unstructured_ingest-0.7.1.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/entry_points.txt +0 -0
@@ -1,166 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass, field
3
- from datetime import datetime
4
- from pathlib import Path
5
-
6
- from unstructured_ingest.enhanced_dataclass import enhanced_field
7
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
8
- from unstructured_ingest.interfaces import (
9
- AccessConfig,
10
- BaseConnectorConfig,
11
- BaseSingleIngestDoc,
12
- BaseSourceConnector,
13
- IngestDocCleanupMixin,
14
- SourceConnectorCleanupMixin,
15
- SourceMetadata,
16
- )
17
- from unstructured_ingest.logger import logger
18
- from unstructured_ingest.utils.dep_check import requires_dependencies
19
-
20
- if t.TYPE_CHECKING:
21
- from praw import Reddit
22
-
23
-
24
@dataclass
class RedditAccessConfig(AccessConfig):
    """Secret part of the Reddit credentials."""

    # Flagged as sensitive through enhanced_field; optional, defaults to None.
    client_secret: t.Optional[str] = enhanced_field(sensitive=True, default=None)
27
-
28
-
29
@dataclass
class SimpleRedditConfig(BaseConnectorConfig):
    """Settings that select which subreddit posts are fetched and with which credentials."""

    access_config: RedditAccessConfig
    subreddit_name: str
    num_posts: int
    user_agent: str
    client_id: str
    search_query: t.Optional[str] = None

    def __post_init__(self):
        """Validate ``num_posts`` right after construction.

        Raises:
            ValueError: if ``num_posts`` is zero or negative.
        """
        non_positive = self.num_posts <= 0
        if non_positive:
            raise ValueError("The number of Reddit posts to fetch must be positive.")
41
-
42
-
43
@dataclass
class RedditIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """One Reddit submission that is written to disk as ``<post_id>.md``."""

    connector_config: SimpleRedditConfig = field(repr=False)
    post_id: str
    registry_name: str = "reddit"

    def _create_full_tmp_dir_path(self):
        """Create the parent directory of the download file if it is missing."""
        self.filename.parent.mkdir(parents=True, exist_ok=True)

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["praw"])
    def get_post(self):
        """Build a praw ``Submission`` for ``post_id`` using the configured credentials."""
        from praw import Reddit
        from praw.models import Submission

        reddit = Reddit(
            client_id=self.connector_config.client_id,
            client_secret=self.connector_config.access_config.client_secret,
            user_agent=self.connector_config.user_agent,
        )
        post = Submission(reddit, self.post_id)
        return post

    def update_source_metadata(self, **kwargs):
        """Populate ``source_metadata`` from the submission.

        Args:
            **kwargs: may carry ``post``, an already-built submission; when the key
                is absent the submission is obtained through ``get_post()``.
        """
        # Only build a client when the caller did not hand over a post. The previous
        # ``kwargs.get("post", self.get_post())`` evaluated get_post() unconditionally.
        post = kwargs["post"] if "post" in kwargs else self.get_post()
        if post is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return

        # The original expression read ``post.auth`` (a typo for ``post.author``) and
        # joined both halves with ``or``, which made each half a tautology.
        # NOTE(review): this now treats a missing/deleted author as "does not exist";
        # confirm that is the intended policy for posts from deleted accounts.
        author_present = post.author is not None and post.author != "[deleted]"
        body_present = post.selftext not in ("[deleted]", "[removed]")
        file_exists = author_present and body_present

        self.source_metadata = SourceMetadata(
            date_created=datetime.utcfromtimestamp(post.created_utc).isoformat(),
            source_url=post.permalink,
            exists=file_exists,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetches the "remote" doc and stores it locally on the filesystem.

        Raises:
            ValueError: if no post could be retrieved for ``post_id``.
        """
        self._create_full_tmp_dir_path()
        post = self.get_post()
        self.update_source_metadata(post=post)
        if post is None:
            raise ValueError(
                f"Failed to retrieve post {self.post_id}",
            )

        # Write the title plus the body, if any
        text_to_write = f"# {post.title}\n{post.selftext}"
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(text_to_write)

    @property
    def filename(self) -> Path:
        """Absolute path of the downloaded markdown file."""
        return (Path(self.read_config.download_dir) / f"{self.post_id}.md").resolve()

    @property
    def _output_filename(self):
        """Path of the JSON output file inside the processor output directory."""
        return Path(self.processor_config.output_dir) / f"{self.post_id}.json"

    @property
    def date_modified(self) -> t.Optional[str]:
        """Always ``None``; this doc does not report a modification date."""
        return None

    @property
    def version(self) -> t.Optional[str]:
        """Always ``None``; this doc does not report a version."""
        return None
116
-
117
-
118
@dataclass
class RedditSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that lists posts of one subreddit as ingest docs."""

    connector_config: SimpleRedditConfig
    _reddit: t.Optional["Reddit"] = field(init=False, default=None)

    @property
    def reddit(self) -> "Reddit":
        """Return the praw client, creating and caching it on first access."""
        from praw import Reddit

        if self._reddit is not None:
            return self._reddit

        config = self.connector_config
        self._reddit = Reddit(
            client_id=config.client_id,
            client_secret=config.access_config.client_secret,
            user_agent=config.user_agent,
        )
        return self._reddit

    @requires_dependencies(["praw"], extras="reddit")
    def initialize(self):
        """Touch the ``reddit`` property so the client gets created."""
        _ = self.reddit

    def check_connection(self):
        """Issue a HEAD request against the ``me`` endpoint to validate the connection.

        Raises:
            SourceConnectionError: if prawcore reports a ``ResponseException``.
        """
        from praw.endpoints import API_PATH
        from prawcore import ResponseException

        try:
            self.reddit._objectify_request(method="HEAD", params=None, path=API_PATH["me"])
        except ResponseException as response_error:
            message = f"failed to validate connection: {response_error}"
            logger.error(message, exc_info=True)
            raise SourceConnectionError(message)

    def get_ingest_docs(self):
        """Return one ``RedditIngestDoc`` per post found.

        Uses a subreddit search when ``search_query`` is set, otherwise the
        subreddit's hot listing; both are limited to ``num_posts``.
        """
        config = self.connector_config
        subreddit = self.reddit.subreddit(config.subreddit_name)
        if config.search_query:
            posts = subreddit.search(config.search_query, limit=config.num_posts)
        else:
            posts = subreddit.hot(limit=config.num_posts)

        ingest_docs = []
        for post in posts:
            ingest_docs.append(
                RedditIngestDoc(
                    connector_config=self.connector_config,
                    processor_config=self.processor_config,
                    read_config=self.read_config,
                    post_id=post.id,
                )
            )
        return ingest_docs
@@ -1,109 +0,0 @@
1
- import json
2
- from typing import Dict, Type, cast
3
-
4
- from unstructured_ingest.connector.airtable import AirtableIngestDoc
5
- from unstructured_ingest.connector.astradb import AstraDBIngestDoc
6
- from unstructured_ingest.connector.biomed import BiomedIngestDoc
7
- from unstructured_ingest.connector.confluence import ConfluenceIngestDoc
8
- from unstructured_ingest.connector.delta_table import DeltaTableIngestDoc
9
- from unstructured_ingest.connector.discord import DiscordIngestDoc
10
- from unstructured_ingest.connector.elasticsearch import (
11
- ElasticsearchIngestDoc,
12
- ElasticsearchIngestDocBatch,
13
- )
14
- from unstructured_ingest.connector.fsspec.azure import AzureBlobStorageIngestDoc
15
- from unstructured_ingest.connector.fsspec.box import BoxIngestDoc
16
- from unstructured_ingest.connector.fsspec.dropbox import DropboxIngestDoc
17
- from unstructured_ingest.connector.fsspec.gcs import GcsIngestDoc
18
- from unstructured_ingest.connector.fsspec.s3 import S3IngestDoc
19
- from unstructured_ingest.connector.fsspec.sftp import SftpIngestDoc
20
- from unstructured_ingest.connector.github import GitHubIngestDoc
21
- from unstructured_ingest.connector.gitlab import GitLabIngestDoc
22
- from unstructured_ingest.connector.google_drive import GoogleDriveIngestDoc
23
- from unstructured_ingest.connector.hubspot import HubSpotIngestDoc
24
- from unstructured_ingest.connector.jira import JiraIngestDoc
25
- from unstructured_ingest.connector.kafka import KafkaIngestDoc
26
- from unstructured_ingest.connector.local import LocalIngestDoc
27
- from unstructured_ingest.connector.mongodb import MongoDBIngestDoc, MongoDBIngestDocBatch
28
- from unstructured_ingest.connector.notion.connector import (
29
- NotionDatabaseIngestDoc,
30
- NotionPageIngestDoc,
31
- )
32
- from unstructured_ingest.connector.onedrive import OneDriveIngestDoc
33
- from unstructured_ingest.connector.opensearch import OpenSearchIngestDoc, OpenSearchIngestDocBatch
34
- from unstructured_ingest.connector.outlook import OutlookIngestDoc
35
- from unstructured_ingest.connector.reddit import RedditIngestDoc
36
- from unstructured_ingest.connector.salesforce import SalesforceIngestDoc
37
- from unstructured_ingest.connector.sharepoint import SharepointIngestDoc
38
- from unstructured_ingest.connector.slack import SlackIngestDoc
39
- from unstructured_ingest.connector.wikipedia import (
40
- WikipediaIngestHTMLDoc,
41
- WikipediaIngestSummaryDoc,
42
- WikipediaIngestTextDoc,
43
- )
44
- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
45
- from unstructured_ingest.interfaces import BaseIngestDoc
46
-
47
# Maps the ``registry_name`` stored in a serialized ingest doc to the class used to
# rebuild it (see create_ingest_doc_from_dict). Keys must match the ``registry_name``
# value each ingest doc class serializes.
INGEST_DOC_NAME_TO_CLASS: Dict[str, Type[EnhancedDataClassJsonMixin]] = {
    "airtable": AirtableIngestDoc,
    "astradb": AstraDBIngestDoc,
    "azure": AzureBlobStorageIngestDoc,
    "biomed": BiomedIngestDoc,
    "box": BoxIngestDoc,
    "confluence": ConfluenceIngestDoc,
    "delta-table": DeltaTableIngestDoc,
    "discord": DiscordIngestDoc,
    "dropbox": DropboxIngestDoc,
    "elasticsearch": ElasticsearchIngestDoc,
    "elasticsearch_batch": ElasticsearchIngestDocBatch,
    "gcs": GcsIngestDoc,
    "github": GitHubIngestDoc,
    "gitlab": GitLabIngestDoc,
    "google_drive": GoogleDriveIngestDoc,
    "hubspot": HubSpotIngestDoc,
    "jira": JiraIngestDoc,
    "kafka": KafkaIngestDoc,
    "local": LocalIngestDoc,
    "mongodb": MongoDBIngestDoc,
    "mongodb_batch": MongoDBIngestDocBatch,
    "notion_database": NotionDatabaseIngestDoc,
    "notion_page": NotionPageIngestDoc,
    "onedrive": OneDriveIngestDoc,
    "opensearch": OpenSearchIngestDoc,
    "opensearch_batch": OpenSearchIngestDocBatch,
    "outlook": OutlookIngestDoc,
    "reddit": RedditIngestDoc,
    "s3": S3IngestDoc,
    "salesforce": SalesforceIngestDoc,
    "sftp": SftpIngestDoc,
    "sharepoint": SharepointIngestDoc,
    "slack": SlackIngestDoc,
    "wikipedia_html": WikipediaIngestHTMLDoc,
    "wikipedia_text": WikipediaIngestTextDoc,
    "wikipedia_summary": WikipediaIngestSummaryDoc,
}
85
-
86
-
87
def create_ingest_doc_from_json(ingest_doc_json: str) -> BaseIngestDoc:
    """Deserialize an ingest doc from its JSON string form.

    Args:
        ingest_doc_json: JSON text of a serialized ingest doc.

    Returns:
        The ingest doc built by ``create_ingest_doc_from_dict``.

    Raises:
        TypeError: if ``json.loads`` rejects the argument with a ``TypeError``;
            the original error is chained as the cause.
    """
    try:
        parsed: dict = json.loads(ingest_doc_json)
    except TypeError as type_error:
        message = f"failed to load json string when deserializing IngestDoc: {ingest_doc_json}"
        raise TypeError(message) from type_error
    else:
        return create_ingest_doc_from_dict(parsed)
95
-
96
-
97
def create_ingest_doc_from_dict(ingest_doc_dict: dict) -> BaseIngestDoc:
    """Rebuild an ingest doc from its dict form.

    Args:
        ingest_doc_dict: serialized ingest doc; must contain ``registry_name``.
            The caller's dict is not modified.

    Returns:
        The instance produced by the registered class's ``from_dict``.

    Raises:
        ValueError: if ``registry_name`` is missing or is not a key of
            ``INGEST_DOC_NAME_TO_CLASS``.
    """
    # Work on a shallow copy so popping registry_name does not touch the caller's dict.
    ingest_doc_dict = ingest_doc_dict.copy()
    if "registry_name" not in ingest_doc_dict:
        raise ValueError(f"registry_name not present in ingest doc: {ingest_doc_dict}")
    registry_name = ingest_doc_dict.pop("registry_name")

    # Keep the try body to the lookup only: a KeyError raised inside from_dict must not
    # be reported as an unknown registry name.
    try:
        ingest_doc_cls = INGEST_DOC_NAME_TO_CLASS[registry_name]
    except KeyError:
        # Single message string; the stray comma used to pass two args to ValueError.
        raise ValueError(
            f"Error: Received unknown IngestDoc name: {registry_name} while deserializing "
            "IngestDoc."
        ) from None
    return cast(BaseIngestDoc, ingest_doc_cls.from_dict(ingest_doc_dict))
@@ -1,301 +0,0 @@
1
- """
2
- Salesforce Connector
3
- Able to download Account, Case, Campaign, EmailMessage, Lead
4
- Salesforce returns everything as a list of json.
5
- This saves each entry as a separate file to be partitioned.
6
- Using JWT authorization
7
- https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_key_and_cert.htm
8
- https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_connected_app.htm
9
- """
10
-
11
- import json
12
- import typing as t
13
- from collections import OrderedDict
14
- from dataclasses import dataclass, field
15
- from datetime import datetime
16
- from email.utils import formatdate
17
- from pathlib import Path
18
- from string import Template
19
- from textwrap import dedent
20
-
21
- from unstructured_ingest.enhanced_dataclass import enhanced_field
22
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
23
- from unstructured_ingest.interfaces import (
24
- AccessConfig,
25
- BaseConnectorConfig,
26
- BaseSingleIngestDoc,
27
- BaseSourceConnector,
28
- IngestDocCleanupMixin,
29
- SourceConnectorCleanupMixin,
30
- SourceMetadata,
31
- )
32
- from unstructured_ingest.logger import logger
33
- from unstructured_ingest.utils.dep_check import requires_dependencies
34
-
35
-
36
class MissingCategoryError(Exception):
    """Raised when a record type does not correspond to any known category."""
38
-
39
-
40
# API version handed to the Salesforce client.
SALESFORCE_API_VERSION = "57.0"

# Record types this connector accepts.
ACCEPTED_CATEGORIES = ["Account", "Case", "Campaign", "EmailMessage", "Lead"]

# Skeleton of a multipart/alternative message; the $placeholders are filled in from an
# EmailMessage record when an .eml file is produced.
EMAIL_TEMPLATE = Template(
    """MIME-Version: 1.0
Date: $date
Message-ID: $message_identifier
Subject: $subject
From: $from_email
To: $to_email
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
--00000000000095c9b205eff92630
Content-Type: text/plain; charset="UTF-8"
$textbody
--00000000000095c9b205eff92630
Content-Type: text/html; charset="UTF-8"
$htmlbody
--00000000000095c9b205eff92630--
""",
)
61
-
62
-
63
@dataclass
class SalesforceAccessConfig(AccessConfig):
    """Credentials for Salesforce JWT authorization."""

    consumer_key: str = enhanced_field(sensitive=True)
    # Either PEM text of a private key or a path to a key file.
    private_key: str = enhanced_field(sensitive=True)

    @requires_dependencies(["cryptography"])
    def get_private_key_value_and_type(self) -> t.Tuple[str, t.Type]:
        """Classify ``private_key`` as inline PEM text or as a file path.

        Returns:
            ``(private_key, str)`` when the value parses as an unencrypted PEM private
            key, ``(private_key, Path)`` when it names an existing file.

        Raises:
            ValueError: if the value is neither PEM key text nor an existing file.
        """
        from cryptography.hazmat.primitives import serialization

        try:
            serialization.load_pem_private_key(data=self.private_key.encode("utf-8"), password=None)
        except ValueError:
            pass
        else:
            return self.private_key, str

        # A malformed PEM blob is usually far longer than a legal file name; on some
        # Python versions is_file() raises OSError (name too long) instead of returning
        # False. Treat any such failure as "not a file" so the ValueError below is what
        # the caller sees.
        try:
            is_key_file = Path(self.private_key).is_file()
        except (OSError, ValueError):
            is_key_file = False
        if is_key_file:
            return self.private_key, Path

        raise ValueError("private_key does not contain PEM private key or path")
83
-
84
-
85
@dataclass
class SimpleSalesforceConfig(BaseConnectorConfig):
    """Connector specific attributes"""

    access_config: SalesforceAccessConfig
    categories: t.List[str]
    username: str
    recursive: bool = False

    @requires_dependencies(["simple_salesforce"], extras="salesforce")
    def get_client(self):
        """Create a ``Salesforce`` client authenticated with the configured private key.

        The key is passed as ``privatekey_file`` when it was classified as a path and
        as ``privatekey`` when it was classified as inline text; the other argument
        is ``None``.
        """
        from simple_salesforce import Salesforce

        key_value, key_kind = self.access_config.get_private_key_value_and_type()

        key_file = None
        key_text = None
        if key_kind is Path:
            key_file = key_value
        if key_kind is str:
            key_text = key_value

        return Salesforce(
            username=self.username,
            consumer_key=self.access_config.consumer_key,
            privatekey_file=key_file,
            privatekey=key_text,
            version=SALESFORCE_API_VERSION,
        )
107
-
108
-
109
@dataclass
class SalesforceIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """One Salesforce record, saved locally as ``.eml`` (EmailMessage) or ``.xml``."""

    connector_config: SimpleSalesforceConfig
    record_type: str
    record_id: str
    registry_name: str = "salesforce"
    # Cache for the fetched record; empty until the ``record`` property is first read.
    _record: OrderedDict = field(default_factory=OrderedDict)

    @property
    def record(self):
        """The Salesforce record, fetched on first access and cached afterwards."""
        if not self._record:
            self._record = self.get_record()
        return self._record

    def get_file_extension(self) -> str:
        """Return the file extension used for this record type.

        Raises:
            MissingCategoryError: if ``record_type`` is not a known category.
        """
        if self.record_type == "EmailMessage":
            extension = ".eml"
        elif self.record_type in ["Account", "Lead", "Case", "Campaign"]:
            extension = ".xml"
        else:
            raise MissingCategoryError(
                f"There are no categories with the name: {self.record_type}",
            )
        return extension

    def _tmp_download_file(self) -> Path:
        """Path of the downloaded file: ``<download_dir>/<record_type>/<id><ext>``."""
        record_file = self.record_id + self.get_file_extension()
        return Path(self.read_config.download_dir) / self.record_type / record_file

    @property
    def _output_filename(self) -> Path:
        """Path of the JSON output: ``<output_dir>/<record_type>/<id><ext>.json``."""
        record_file = self.record_id + self.get_file_extension() + ".json"
        return Path(self.processor_config.output_dir) / self.record_type / record_file

    def _create_full_tmp_dir_path(self):
        """Create the directory that will hold the downloaded file."""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    def _xml_for_record(self, record: OrderedDict) -> str:
        """Creates partitionable xml file from a record"""
        import xml.etree.ElementTree as ET

        def flatten_dict(data, parent, prefix=""):
            # Nested OrderedDicts are flattened into dotted keys; every leaf becomes
            # one <item> whose text is "key: value".
            for key, value in data.items():
                if isinstance(value, OrderedDict):
                    flatten_dict(value, parent, prefix=f"{prefix}{key}.")
                else:
                    item = ET.Element("item")
                    item.text = f"{prefix}{key}: {value}"
                    parent.append(item)

        root = ET.Element("root")
        flatten_dict(record, root)
        xml_string = ET.tostring(root, encoding="utf-8", xml_declaration=True).decode()
        return xml_string

    def _eml_for_record(self, email_json: t.Dict[str, t.Any]) -> str:
        """Recreates standard expected .eml format using template."""
        from dateutil import parser  # type: ignore

        # The key can be present with a None value, in which case ``.get(key, "")``
        # still returns None and ``.replace`` would fail; fall back to "" explicitly.
        html_body = email_json.get("HtmlBody") or ""

        eml = EMAIL_TEMPLATE.substitute(
            date=formatdate(parser.parse(email_json.get("MessageDate")).timestamp()),
            message_identifier=email_json.get("MessageIdentifier"),
            subject=email_json.get("Subject"),
            from_email=email_json.get("FromAddress"),
            to_email=email_json.get("ToAddress"),
            textbody=email_json.get("TextBody"),
            # TODO: This is a hack to get emails to process correctly.
            # The HTML partitioner seems to have issues with <br> and text without tags like <p>
            htmlbody=html_body.replace("<br />", "<p>").replace("<body", "<body><p"),
        )
        return dedent(eml)

    @SourceConnectionNetworkError.wrap
    def _get_response(self):
        """Query Salesforce for the standard fields of this record."""
        client = self.connector_config.get_client()
        return client.query_all(
            f"select FIELDS(STANDARD) from {self.record_type} where Id='{self.record_id}'",
        )

    def get_record(self) -> OrderedDict:
        """Fetch this record from Salesforce by id.

        Raises:
            ValueError: if the query returned no records.
        """
        response = self._get_response()
        logger.debug(f"response was returned for salesforce record id: {self.record_id}")
        records = response["records"]
        if not records:
            raise ValueError(
                f"No record found with record id {self.record_id}: {json.dumps(response)}"
            )
        record_json = records[0]
        return record_json

    def update_source_metadata(self) -> None:  # type: ignore
        """Populate ``source_metadata`` from the record's date, version and url fields."""
        record_json = self.record

        date_format = "%Y-%m-%dT%H:%M:%S.000+0000"
        self.source_metadata = SourceMetadata(
            date_created=datetime.strptime(record_json["CreatedDate"], date_format).isoformat(),
            date_modified=datetime.strptime(
                record_json["LastModifiedDate"],
                date_format,
            ).isoformat(),
            # SystemModstamp is Timestamp if record has been modified by person or automated system
            version=record_json.get("SystemModstamp"),
            source_url=record_json["attributes"].get("url"),
            exists=True,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Saves individual json records locally."""
        self._create_full_tmp_dir_path()
        record = self.record

        self.update_source_metadata()

        try:
            if self.record_type == "EmailMessage":
                document = self._eml_for_record(record)
            else:
                document = self._xml_for_record(record)

            # Write as utf8 explicitly: the XML declaration says utf-8, so relying on
            # the platform default encoding could produce a mislabelled file.
            with open(self._tmp_download_file(), "w", encoding="utf8") as page_file:
                page_file.write(document)

        except Exception as e:
            # Deliberately best-effort: a failed record is logged, not raised.
            logger.error(
                f"Error while downloading and saving file: {self.record_id}.",
            )
            # Include the traceback so the failure can be diagnosed from the log.
            logger.error(e, exc_info=True)

    @property
    def filename(self):
        """The filename of the file created from a Salesforce record"""
        return self._tmp_download_file()
246
-
247
-
248
@dataclass
class SalesforceSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that lists Salesforce record ids as ingest docs."""

    connector_config: SimpleSalesforceConfig

    def __post_init__(self):
        self.ingest_doc_cls: t.Type[SalesforceIngestDoc] = SalesforceIngestDoc

    def initialize(self):
        """Nothing to set up; a client is created on demand."""
        pass

    @requires_dependencies(["simple_salesforce"], extras="salesforce")
    def check_connection(self):
        """Validate the credentials by creating a client.

        Raises:
            SourceConnectionError: if client creation raises a ``SalesforceError``.
        """
        from simple_salesforce.exceptions import SalesforceError

        try:
            self.connector_config.get_client()
        except SalesforceError as salesforce_error:
            logger.error(f"failed to validate connection: {salesforce_error}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {salesforce_error}")

    @requires_dependencies(["simple_salesforce"], extras="salesforce")
    def get_ingest_docs(self) -> t.List[SalesforceIngestDoc]:
        """Get Salesforce Ids for the records.
        Send them to next phase where each doc gets downloaded into the
        appropriate format for partitioning.

        Raises:
            ValueError: if a configured category is not in ``ACCEPTED_CATEGORIES``.
            SalesforceMalformedRequest: re-raised unchanged when an id query is rejected.
        """
        from simple_salesforce.exceptions import SalesforceMalformedRequest

        # Reject unsupported categories before any query is sent, instead of failing
        # part-way through after earlier categories were already fetched.
        for record_type in self.connector_config.categories:
            if record_type not in ACCEPTED_CATEGORIES:
                raise ValueError(f"{record_type} not currently an accepted Salesforce category")

        client = self.connector_config.get_client()

        ingest_docs = []
        for record_type in self.connector_config.categories:
            try:
                # Get ids from Salesforce
                records = client.query_all(
                    f"select Id from {record_type}",
                )
            except SalesforceMalformedRequest as e:
                # Re-raise the original exception: constructing a new
                # SalesforceMalformedRequest from a single message string does not match
                # its (url, status, resource_name, content) constructor.
                logger.error(f"Problem with Salesforce query: {e}")
                raise

            ingest_docs.extend(
                SalesforceIngestDoc(
                    connector_config=self.connector_config,
                    processor_config=self.processor_config,
                    read_config=self.read_config,
                    record_type=record_type,
                    record_id=record["Id"],
                )
                for record in records["records"]
            )

        return ingest_docs