unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568):
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +21 -21
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,34 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.kafka import SimpleKafkaConfig
12
-
13
-
14
- @dataclass
15
- class KafkaRunner(Runner):
16
- connector_config: "SimpleKafkaConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- str(self.connector_config.bootstrap_server).encode("utf-8"),
21
- )
22
- self.read_config.download_dir = update_download_dir_hash(
23
- connector_name="kafka",
24
- read_config=self.read_config,
25
- hashed_dir_name=hashed_dir_name,
26
- logger=logger,
27
- )
28
-
29
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
30
- from unstructured_ingest.connector.kafka import (
31
- KafkaSourceConnector,
32
- )
33
-
34
- return KafkaSourceConnector
@@ -1,23 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.interfaces import BaseSourceConnector
5
- from unstructured_ingest.runner.base_runner import Runner
6
-
7
- if t.TYPE_CHECKING:
8
- from unstructured_ingest.connector.local import SimpleLocalConfig
9
-
10
-
11
- @dataclass
12
- class LocalRunner(Runner):
13
- connector_config: "SimpleLocalConfig"
14
-
15
- def update_read_config(self):
16
- pass
17
-
18
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
19
- from unstructured_ingest.connector.local import (
20
- LocalSourceConnector,
21
- )
22
-
23
- return LocalSourceConnector
@@ -1,34 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.mongodb import SimpleMongoDBConfig
12
-
13
-
14
- @dataclass
15
- class MongoDBRunner(Runner):
16
- connector_config: "SimpleMongoDBConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- str(self.connector_config.access_config.uri).encode("utf-8"),
21
- )
22
- self.read_config.download_dir = update_download_dir_hash(
23
- connector_name="mongodb",
24
- read_config=self.read_config,
25
- hashed_dir_name=hashed_dir_name,
26
- logger=logger,
27
- )
28
-
29
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
30
- from unstructured_ingest.connector.mongodb import (
31
- MongoDBSourceConnector,
32
- )
33
-
34
- return MongoDBSourceConnector
@@ -1,61 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.notion.connector import SimpleNotionConfig
12
-
13
-
14
- @dataclass
15
- class NotionRunner(Runner):
16
- connector_config: "SimpleNotionConfig"
17
-
18
- def update_read_config(self):
19
- if not self.connector_config.page_ids and not self.connector_config.database_ids:
20
- raise ValueError("no page ids nor database ids provided")
21
-
22
- if self.connector_config.page_ids and self.connector_config.database_ids:
23
- hashed_dir_name = hashlib.sha256(
24
- "{},{}".format(
25
- ",".join(self.connector_config.page_ids),
26
- ",".join(self.connector_config.database_ids),
27
- ).encode("utf-8"),
28
- )
29
- elif self.connector_config.page_ids:
30
- hashed_dir_name = hashlib.sha256(
31
- ",".join(self.connector_config.page_ids).encode("utf-8"),
32
- )
33
- elif self.connector_config.database_ids:
34
- hashed_dir_name = hashlib.sha256(
35
- ",".join(self.connector_config.database_ids).encode("utf-8"),
36
- )
37
- else:
38
- raise ValueError("could not create local cache directory name")
39
-
40
- self.read_config.download_dir = update_download_dir_hash(
41
- connector_name="notion",
42
- read_config=self.read_config,
43
- hashed_dir_name=hashed_dir_name,
44
- logger=logger,
45
- )
46
-
47
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
48
- from unstructured_ingest.connector.notion.connector import (
49
- NotionSourceConnector,
50
- )
51
-
52
- return NotionSourceConnector
53
-
54
- def get_source_connector(self) -> BaseSourceConnector:
55
- source_connector_cls = self.get_source_connector_cls()
56
- return source_connector_cls(
57
- processor_config=self.processor_config,
58
- connector_config=self.connector_config,
59
- read_config=self.read_config,
60
- retry_strategy_config=self.retry_strategy_config,
61
- )
@@ -1,35 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.onedrive import SimpleOneDriveConfig
12
-
13
-
14
- @dataclass
15
- class OneDriveRunner(Runner):
16
- connector_config: "SimpleOneDriveConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- f"{self.connector_config.tenant}_{self.connector_config.user_pname}".encode("utf-8"),
21
- )
22
-
23
- self.read_config.download_dir = update_download_dir_hash(
24
- connector_name="onedrive",
25
- read_config=self.read_config,
26
- hashed_dir_name=hashed_dir_name,
27
- logger=logger,
28
- )
29
-
30
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
31
- from unstructured_ingest.connector.onedrive import (
32
- OneDriveSourceConnector,
33
- )
34
-
35
- return OneDriveSourceConnector
@@ -1,40 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.opensearch import SimpleOpenSearchConfig
12
-
13
-
14
- @dataclass
15
- class OpenSearchRunner(Runner):
16
- connector_config: "SimpleOpenSearchConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- "{}_{}".format(
21
- ",".join(self.connector_config.access_config.hosts),
22
- self.connector_config.index_name,
23
- ).encode(
24
- "utf-8",
25
- ),
26
- )
27
-
28
- self.read_config.download_dir = update_download_dir_hash(
29
- connector_name="opensearch",
30
- read_config=self.read_config,
31
- hashed_dir_name=hashed_dir_name,
32
- logger=logger,
33
- )
34
-
35
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
36
- from unstructured_ingest.connector.opensearch import (
37
- OpenSearchSourceConnector,
38
- )
39
-
40
- return OpenSearchSourceConnector
@@ -1,33 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.outlook import SimpleOutlookConfig
12
-
13
-
14
- @dataclass
15
- class OutlookRunner(Runner):
16
- connector_config: "SimpleOutlookConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(self.connector_config.user_email.encode("utf-8"))
20
-
21
- self.read_config.download_dir = update_download_dir_hash(
22
- connector_name="outlook",
23
- read_config=self.read_config,
24
- hashed_dir_name=hashed_dir_name,
25
- logger=logger,
26
- )
27
-
28
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
29
- from unstructured_ingest.connector.outlook import (
30
- OutlookSourceConnector,
31
- )
32
-
33
- return OutlookSourceConnector
@@ -1,35 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.reddit import SimpleRedditConfig
12
-
13
-
14
- @dataclass
15
- class RedditRunner(Runner):
16
- connector_config: "SimpleRedditConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- self.connector_config.subreddit_name.encode("utf-8"),
21
- )
22
-
23
- self.read_config.download_dir = update_download_dir_hash(
24
- connector_name="reddit",
25
- read_config=self.read_config,
26
- hashed_dir_name=hashed_dir_name,
27
- logger=logger,
28
- )
29
-
30
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
31
- from unstructured_ingest.connector.reddit import (
32
- RedditSourceConnector,
33
- )
34
-
35
- return RedditSourceConnector
@@ -1,33 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.salesforce import SimpleSalesforceConfig
12
-
13
-
14
- @dataclass
15
- class SalesforceRunner(Runner):
16
- connector_config: "SimpleSalesforceConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(self.connector_config.username.encode("utf-8"))
20
-
21
- self.read_config.download_dir = update_download_dir_hash(
22
- connector_name="salesforce",
23
- read_config=self.read_config,
24
- hashed_dir_name=hashed_dir_name,
25
- logger=logger,
26
- )
27
-
28
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
29
- from unstructured_ingest.connector.salesforce import (
30
- SalesforceSourceConnector,
31
- )
32
-
33
- return SalesforceSourceConnector
@@ -1,35 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.sharepoint import SimpleSharepointConfig
12
-
13
-
14
- @dataclass
15
- class SharePointRunner(Runner):
16
- connector_config: "SimpleSharepointConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- f"{self.connector_config.site}_{self.connector_config.path}".encode("utf-8"),
21
- )
22
-
23
- self.read_config.download_dir = update_download_dir_hash(
24
- connector_name="sharepoint",
25
- read_config=self.read_config,
26
- hashed_dir_name=hashed_dir_name,
27
- logger=logger,
28
- )
29
-
30
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
31
- from unstructured_ingest.connector.sharepoint import (
32
- SharepointSourceConnector,
33
- )
34
-
35
- return SharepointSourceConnector
@@ -1,33 +0,0 @@
1
- import hashlib
2
- import typing as t
3
-
4
- from unstructured_ingest.interfaces import BaseSourceConnector
5
- from unstructured_ingest.logger import logger
6
- from unstructured_ingest.runner.base_runner import Runner
7
- from unstructured_ingest.runner.utils import update_download_dir_hash
8
-
9
- if t.TYPE_CHECKING:
10
- from unstructured_ingest.connector.slack import SimpleSlackConfig
11
-
12
-
13
- class SlackRunner(Runner):
14
- connector_config: "SimpleSlackConfig"
15
-
16
- def update_read_config(self):
17
- hashed_dir_name = hashlib.sha256(
18
- ",".join(self.connector_config.channels).encode("utf-8"),
19
- )
20
-
21
- self.read_config.download_dir = update_download_dir_hash(
22
- connector_name="slack",
23
- read_config=self.read_config,
24
- hashed_dir_name=hashed_dir_name,
25
- logger=logger,
26
- )
27
-
28
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
29
- from unstructured_ingest.connector.slack import (
30
- SlackSourceConnector,
31
- )
32
-
33
- return SlackSourceConnector
@@ -1,47 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import hashlib
4
- import logging
5
- from pathlib import Path
6
-
7
- from unstructured_ingest.interfaces import (
8
- ReadConfig,
9
- )
10
-
11
-
12
- def update_download_dir_remote_url(
13
- connector_name: str,
14
- read_config: ReadConfig,
15
- remote_url: str,
16
- logger: logging.Logger,
17
- ) -> str:
18
- hashed_dir_name = hashlib.sha256(remote_url.encode("utf-8"))
19
- return update_download_dir_hash(
20
- connector_name=connector_name,
21
- read_config=read_config,
22
- hashed_dir_name=hashed_dir_name,
23
- logger=logger,
24
- )
25
-
26
-
27
- def update_download_dir_hash(
28
- connector_name: str,
29
- read_config: ReadConfig,
30
- hashed_dir_name: hashlib._Hash,
31
- logger: logging.Logger,
32
- ) -> str:
33
- if not read_config.download_dir:
34
- cache_path = Path.home() / ".cache" / "unstructured" / "ingest"
35
- if not cache_path.exists():
36
- cache_path.mkdir(parents=True, exist_ok=True)
37
- download_dir = cache_path / connector_name / hashed_dir_name.hexdigest()[:10]
38
- if read_config.preserve_downloads:
39
- logger.warning(
40
- f"Preserving downloaded files but download_dir is not specified,"
41
- f" using {download_dir}",
42
- )
43
- new_download_dir = str(download_dir)
44
- logger.debug(f"updating download directory to: {new_download_dir}")
45
- else:
46
- new_download_dir = read_config.download_dir
47
- return new_download_dir
@@ -1,35 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.wikipedia import SimpleWikipediaConfig
12
-
13
-
14
- @dataclass
15
- class WikipediaRunner(Runner):
16
- connector_config: "SimpleWikipediaConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- self.connector_config.page_title.encode("utf-8"),
21
- )
22
-
23
- self.read_config.download_dir = update_download_dir_hash(
24
- connector_name="wikipedia",
25
- read_config=self.read_config,
26
- hashed_dir_name=hashed_dir_name,
27
- logger=logger,
28
- )
29
-
30
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
31
- from unstructured_ingest.connector.wikipedia import (
32
- WikipediaSourceConnector,
33
- )
34
-
35
- return WikipediaSourceConnector
@@ -1,48 +0,0 @@
1
- import typing as t
2
-
3
- from .astradb import AstraDBWriter
4
- from .azure_ai_search import AzureAiSearchWriter
5
- from .base_writer import Writer
6
- from .chroma import ChromaWriter
7
- from .clarifai import ClarifaiWriter
8
- from .databricks_volumes import DatabricksVolumesWriter
9
- from .delta_table import DeltaTableWriter
10
- from .elasticsearch import ElasticsearchWriter
11
- from .fsspec.azure import AzureWriter
12
- from .fsspec.box import BoxWriter
13
- from .fsspec.dropbox import DropboxWriter
14
- from .fsspec.gcs import GcsWriter
15
- from .fsspec.s3 import S3Writer
16
- from .kafka import KafkaWriter
17
- from .mongodb import MongodbWriter
18
- from .opensearch import OpenSearchWriter
19
- from .pinecone import PineconeWriter
20
- from .qdrant import QdrantWriter
21
- from .sql import SqlWriter
22
- from .vectara import VectaraWriter
23
- from .weaviate import WeaviateWriter
24
-
25
- writer_map: t.Dict[str, t.Type[Writer]] = {
26
- "astradb": AstraDBWriter,
27
- "azure": AzureWriter,
28
- "azure_ai_search": AzureAiSearchWriter,
29
- "box": BoxWriter,
30
- "chroma": ChromaWriter,
31
- "clarifai": ClarifaiWriter,
32
- "databricks_volumes": DatabricksVolumesWriter,
33
- "delta_table": DeltaTableWriter,
34
- "dropbox": DropboxWriter,
35
- "elasticsearch": ElasticsearchWriter,
36
- "gcs": GcsWriter,
37
- "kafka": KafkaWriter,
38
- "mongodb": MongodbWriter,
39
- "opensearch": OpenSearchWriter,
40
- "pinecone": PineconeWriter,
41
- "qdrant": QdrantWriter,
42
- "s3": S3Writer,
43
- "sql": SqlWriter,
44
- "vectara": VectaraWriter,
45
- "weaviate": WeaviateWriter,
46
- }
47
-
48
- __all__ = ["writer_map"]
@@ -1,22 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
5
- from unstructured_ingest.interfaces import BaseDestinationConnector
6
- from unstructured_ingest.runner.writers.base_writer import Writer
7
-
8
- if t.TYPE_CHECKING:
9
- from unstructured_ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig
10
-
11
-
12
- @dataclass
13
- class AstraDBWriter(Writer, EnhancedDataClassJsonMixin):
14
- write_config: "AstraDBWriteConfig"
15
- connector_config: "SimpleAstraDBConfig"
16
-
17
- def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
18
- from unstructured_ingest.connector.astradb import (
19
- AstraDBDestinationConnector,
20
- )
21
-
22
- return AstraDBDestinationConnector
@@ -1,24 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.interfaces import BaseDestinationConnector
5
- from unstructured_ingest.runner.writers.base_writer import Writer
6
-
7
- if t.TYPE_CHECKING:
8
- from unstructured_ingest.connector.azure_ai_search import (
9
- AzureAISearchWriteConfig,
10
- SimpleAzureAISearchStorageConfig,
11
- )
12
-
13
-
14
- @dataclass
15
- class AzureAiSearchWriter(Writer):
16
- connector_config: "SimpleAzureAISearchStorageConfig"
17
- write_config: "AzureAISearchWriteConfig"
18
-
19
- def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
20
- from unstructured_ingest.connector.azure_ai_search import (
21
- AzureAISearchDestinationConnector,
22
- )
23
-
24
- return AzureAISearchDestinationConnector
@@ -1,26 +0,0 @@
1
- import typing as t
2
- from abc import ABC, abstractmethod
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import (
6
- BaseConnectorConfig,
7
- BaseDestinationConnector,
8
- WriteConfig,
9
- )
10
-
11
-
12
- @dataclass
13
- class Writer(ABC):
14
- connector_config: BaseConnectorConfig
15
- write_config: WriteConfig
16
-
17
- @abstractmethod
18
- def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
19
- pass
20
-
21
- def get_connector(self, **kwargs) -> BaseDestinationConnector:
22
- connector_cls = self.get_connector_cls()
23
- return connector_cls(
24
- write_config=self.write_config,
25
- connector_config=self.connector_config,
26
- )
@@ -1,22 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
5
- from unstructured_ingest.interfaces import BaseDestinationConnector
6
- from unstructured_ingest.runner.writers.base_writer import Writer
7
-
8
- if t.TYPE_CHECKING:
9
- from unstructured_ingest.connector.chroma import ChromaWriteConfig, SimpleChromaConfig
10
-
11
-
12
- @dataclass
13
- class ChromaWriter(Writer, EnhancedDataClassJsonMixin):
14
- write_config: "ChromaWriteConfig"
15
- connector_config: "SimpleChromaConfig"
16
-
17
- def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
18
- from unstructured_ingest.connector.chroma import (
19
- ChromaDestinationConnector,
20
- )
21
-
22
- return ChromaDestinationConnector
@@ -1,19 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.interfaces import BaseDestinationConnector
5
- from unstructured_ingest.runner.writers.base_writer import Writer
6
-
7
- if t.TYPE_CHECKING:
8
- from unstructured_ingest.connector.clarifai import ClarifaiWriteConfig, SimpleClarifaiConfig
9
-
10
-
11
- @dataclass
12
- class ClarifaiWriter(Writer):
13
- write_config: "ClarifaiWriteConfig"
14
- connector_config: "SimpleClarifaiConfig"
15
-
16
- def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
17
- from unstructured_ingest.connector.clarifai import ClarifaiDestinationConnector
18
-
19
- return ClarifaiDestinationConnector