unstructured-ingest 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +49 -0
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/processes/connectors/github.py +221 -0
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,89 +0,0 @@
1
- import logging
2
- import typing as t
3
- from abc import ABC, abstractmethod
4
- from dataclasses import dataclass
5
-
6
- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
7
- from unstructured_ingest.interfaces import (
8
- BaseConnectorConfig,
9
- BaseDestinationConnector,
10
- BaseSourceConnector,
11
- ChunkingConfig,
12
- EmbeddingConfig,
13
- PartitionConfig,
14
- PermissionsConfig,
15
- ProcessorConfig,
16
- ReadConfig,
17
- RetryStrategyConfig,
18
- )
19
- from unstructured_ingest.logger import ingest_log_streaming_init
20
- from unstructured_ingest.processor import process_documents
21
- from unstructured_ingest.runner.writers.base_writer import Writer
22
-
23
-
24
- @dataclass
25
- class Runner(EnhancedDataClassJsonMixin, ABC):
26
- connector_config: BaseConnectorConfig
27
- processor_config: ProcessorConfig
28
- read_config: ReadConfig
29
- partition_config: PartitionConfig
30
- writer: t.Optional[Writer] = None
31
- writer_kwargs: t.Optional[dict] = None
32
- embedding_config: t.Optional[EmbeddingConfig] = None
33
- chunking_config: t.Optional[ChunkingConfig] = None
34
- permissions_config: t.Optional[PermissionsConfig] = None
35
- retry_strategy_config: t.Optional[RetryStrategyConfig] = None
36
-
37
- def run(self, *args, **kwargs):
38
- ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO)
39
- self.update_read_config()
40
- source_connector = self.get_source_connector()
41
- self.process_documents(
42
- source_doc_connector=source_connector,
43
- )
44
-
45
- @abstractmethod
46
- def update_read_config(self):
47
- pass
48
-
49
- @abstractmethod
50
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
51
- pass
52
-
53
- def get_source_connector(self) -> BaseSourceConnector:
54
- source_connector_cls = self.get_source_connector_cls()
55
- return source_connector_cls(
56
- processor_config=self.processor_config,
57
- connector_config=self.connector_config,
58
- read_config=self.read_config,
59
- )
60
-
61
- def get_dest_doc_connector(self) -> t.Optional[BaseDestinationConnector]:
62
- writer_kwargs = self.writer_kwargs if self.writer_kwargs else {}
63
- if self.writer:
64
- return self.writer.get_connector(**writer_kwargs)
65
- return None
66
-
67
- def get_permissions_config(self) -> t.Optional[PermissionsConfig]:
68
- if self.permissions_config is None:
69
- return None
70
-
71
- permissions_config_filled = bool(
72
- self.permissions_config.application_id
73
- and self.permissions_config.client_cred
74
- and self.permissions_config.tenant,
75
- )
76
-
77
- return self.permissions_config if permissions_config_filled else None
78
-
79
- def process_documents(self, source_doc_connector: BaseSourceConnector):
80
- process_documents(
81
- processor_config=self.processor_config,
82
- source_doc_connector=source_doc_connector,
83
- partition_config=self.partition_config,
84
- dest_doc_connector=self.get_dest_doc_connector(),
85
- embedder_config=self.embedding_config,
86
- chunking_config=self.chunking_config,
87
- permissions_config=self.get_permissions_config(),
88
- retry_strategy_config=self.retry_strategy_config,
89
- )
@@ -1,45 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.biomed import SimpleBiomedConfig
12
-
13
-
14
- @dataclass
15
- class BiomedRunner(Runner):
16
- connector_config: "SimpleBiomedConfig"
17
-
18
- def update_read_config(self):
19
- base_path = (
20
- self.connector_config.path
21
- if self.connector_config.path
22
- else "{}-{}-{}".format(
23
- self.connector_config.api_id if self.connector_config.api_id else "",
24
- self.connector_config.api_from if self.connector_config.api_from else "",
25
- self.connector_config.api_until if self.connector_config.api_until else "",
26
- )
27
- )
28
-
29
- hashed_dir_name = hashlib.sha256(
30
- base_path.encode("utf-8"),
31
- )
32
-
33
- self.read_config.download_dir = update_download_dir_hash(
34
- connector_name="biomed",
35
- read_config=self.read_config,
36
- hashed_dir_name=hashed_dir_name,
37
- logger=logger,
38
- )
39
-
40
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
41
- from unstructured_ingest.connector.biomed import (
42
- BiomedSourceConnector,
43
- )
44
-
45
- return BiomedSourceConnector
@@ -1,35 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.confluence import SimpleConfluenceConfig
12
-
13
-
14
- @dataclass
15
- class ConfluenceRunner(Runner):
16
- connector_config: "SimpleConfluenceConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- self.connector_config.url.encode("utf-8"),
21
- )
22
-
23
- self.read_config.download_dir = update_download_dir_hash(
24
- connector_name="confluence",
25
- read_config=self.read_config,
26
- hashed_dir_name=hashed_dir_name,
27
- logger=logger,
28
- )
29
-
30
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
31
- from unstructured_ingest.connector.confluence import (
32
- ConfluenceSourceConnector,
33
- )
34
-
35
- return ConfluenceSourceConnector
@@ -1,34 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.delta_table import SimpleDeltaTableConfig
12
-
13
-
14
- @dataclass
15
- class DeltaTableRunner(Runner):
16
- connector_config: "SimpleDeltaTableConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- str(self.connector_config.table_uri).encode("utf-8"),
21
- )
22
- self.read_config.download_dir = update_download_dir_hash(
23
- connector_name="delta_table",
24
- read_config=self.read_config,
25
- hashed_dir_name=hashed_dir_name,
26
- logger=logger,
27
- )
28
-
29
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
30
- from unstructured_ingest.connector.delta_table import (
31
- DeltaTableSourceConnector,
32
- )
33
-
34
- return DeltaTableSourceConnector
@@ -1,35 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.discord import SimpleDiscordConfig
12
-
13
-
14
- @dataclass
15
- class DiscordRunner(Runner):
16
- connector_config: "SimpleDiscordConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- ",".join(self.connector_config.channels).encode("utf-8"),
21
- )
22
-
23
- self.read_config.download_dir = update_download_dir_hash(
24
- connector_name="discord",
25
- read_config=self.read_config,
26
- hashed_dir_name=hashed_dir_name,
27
- logger=logger,
28
- )
29
-
30
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
31
- from unstructured_ingest.connector.discord import (
32
- DiscordSourceConnector,
33
- )
34
-
35
- return DiscordSourceConnector
@@ -1,40 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.elasticsearch import SimpleElasticsearchConfig
12
-
13
-
14
- @dataclass
15
- class ElasticSearchRunner(Runner):
16
- connector_config: "SimpleElasticsearchConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- "{}_{}".format(
21
- ",".join(self.connector_config.access_config.hosts),
22
- self.connector_config.index_name,
23
- ).encode(
24
- "utf-8",
25
- ),
26
- )
27
-
28
- self.read_config.download_dir = update_download_dir_hash(
29
- connector_name="elasticsearch",
30
- read_config=self.read_config,
31
- hashed_dir_name=hashed_dir_name,
32
- logger=logger,
33
- )
34
-
35
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
36
- from unstructured_ingest.connector.elasticsearch import (
37
- ElasticsearchSourceConnector,
38
- )
39
-
40
- return ElasticsearchSourceConnector
@@ -1,30 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.interfaces import BaseSourceConnector
5
- from unstructured_ingest.logger import logger
6
- from unstructured_ingest.runner.base_runner import Runner
7
- from unstructured_ingest.runner.utils import update_download_dir_remote_url
8
-
9
- if t.TYPE_CHECKING:
10
- from unstructured_ingest.connector.fsspec.azure import SimpleAzureBlobStorageConfig
11
-
12
-
13
- @dataclass
14
- class AzureRunner(Runner):
15
- connector_config: "SimpleAzureBlobStorageConfig"
16
-
17
- def update_read_config(self):
18
- self.read_config.download_dir = update_download_dir_remote_url(
19
- connector_name="azure",
20
- read_config=self.read_config,
21
- remote_url=self.connector_config.remote_url, # type: ignore
22
- logger=logger,
23
- )
24
-
25
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
26
- from unstructured_ingest.connector.fsspec.azure import (
27
- AzureBlobStorageSourceConnector,
28
- )
29
-
30
- return AzureBlobStorageSourceConnector
@@ -1,28 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.interfaces import BaseSourceConnector
5
- from unstructured_ingest.logger import logger
6
- from unstructured_ingest.runner.base_runner import Runner
7
- from unstructured_ingest.runner.utils import update_download_dir_remote_url
8
-
9
- if t.TYPE_CHECKING:
10
- from unstructured_ingest.connector.fsspec.box import SimpleBoxConfig
11
-
12
-
13
- @dataclass
14
- class BoxRunner(Runner):
15
- connector_config: "SimpleBoxConfig"
16
-
17
- def update_read_config(self):
18
- self.read_config.download_dir = update_download_dir_remote_url(
19
- connector_name="box",
20
- read_config=self.read_config,
21
- remote_url=self.connector_config.remote_url, # type: ignore
22
- logger=logger,
23
- )
24
-
25
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
26
- from unstructured_ingest.connector.fsspec.box import BoxSourceConnector
27
-
28
- return BoxSourceConnector
@@ -1,30 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.interfaces import BaseSourceConnector
5
- from unstructured_ingest.logger import logger
6
- from unstructured_ingest.runner.base_runner import Runner
7
- from unstructured_ingest.runner.utils import update_download_dir_remote_url
8
-
9
- if t.TYPE_CHECKING:
10
- from unstructured_ingest.connector.fsspec.dropbox import SimpleDropboxConfig
11
-
12
-
13
- @dataclass
14
- class DropboxRunner(Runner):
15
- connector_config: "SimpleDropboxConfig"
16
-
17
- def update_read_config(self):
18
- self.read_config.download_dir = update_download_dir_remote_url(
19
- connector_name="dropbox",
20
- read_config=self.read_config,
21
- remote_url=self.connector_config.remote_url, # type: ignore
22
- logger=logger,
23
- )
24
-
25
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
26
- from unstructured_ingest.connector.fsspec.dropbox import (
27
- DropboxSourceConnector,
28
- )
29
-
30
- return DropboxSourceConnector
@@ -1,40 +0,0 @@
1
- import typing as t
2
- import warnings
3
- from dataclasses import dataclass
4
- from urllib.parse import urlparse
5
-
6
- from unstructured_ingest.interfaces import BaseSourceConnector
7
- from unstructured_ingest.logger import logger
8
- from unstructured_ingest.runner.base_runner import Runner
9
- from unstructured_ingest.runner.utils import update_download_dir_remote_url
10
-
11
- if t.TYPE_CHECKING:
12
- from unstructured_ingest.connector.fsspec.fsspec import SimpleFsspecConfig
13
-
14
-
15
- @dataclass
16
- class FsspecRunner(Runner):
17
- connector_config: "SimpleFsspecConfig"
18
-
19
- def update_read_config(self):
20
- self.read_config.download_dir = update_download_dir_remote_url(
21
- connector_name="fsspec",
22
- read_config=self.read_config,
23
- remote_url=self.fsspec_config.remote_url, # type: ignore
24
- logger=logger,
25
- )
26
-
27
- protocol = urlparse(self.fsspec_config.remote_url).scheme # type: ignore
28
- warnings.warn(
29
- f"`fsspec` protocol {protocol} is not directly supported by `unstructured`,"
30
- " so use it at your own risk. Supported protocols are `gcs`, `gs`, `s3`, `s3a`,"
31
- "`dropbox`, `abfs`, `az` and `sftp`.",
32
- UserWarning,
33
- )
34
-
35
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
36
- from unstructured_ingest.connector.fsspec.fsspec import (
37
- FsspecSourceConnector,
38
- )
39
-
40
- return FsspecSourceConnector
@@ -1,28 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.interfaces import BaseSourceConnector
5
- from unstructured_ingest.logger import logger
6
- from unstructured_ingest.runner.base_runner import Runner
7
- from unstructured_ingest.runner.utils import update_download_dir_remote_url
8
-
9
- if t.TYPE_CHECKING:
10
- from unstructured_ingest.connector.fsspec.gcs import SimpleGcsConfig
11
-
12
-
13
- @dataclass
14
- class GCSRunner(Runner):
15
- connector_config: "SimpleGcsConfig"
16
-
17
- def update_read_config(self):
18
- self.read_config.download_dir = update_download_dir_remote_url(
19
- connector_name="gcs",
20
- read_config=self.read_config,
21
- remote_url=self.connector_config.remote_url, # type: ignore
22
- logger=logger,
23
- )
24
-
25
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
26
- from unstructured_ingest.connector.fsspec.gcs import GcsSourceConnector
27
-
28
- return GcsSourceConnector
@@ -1,28 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.interfaces import BaseSourceConnector
5
- from unstructured_ingest.logger import logger
6
- from unstructured_ingest.runner.base_runner import Runner
7
- from unstructured_ingest.runner.utils import update_download_dir_remote_url
8
-
9
- if t.TYPE_CHECKING:
10
- from unstructured_ingest.connector.fsspec.s3 import SimpleS3Config
11
-
12
-
13
- @dataclass
14
- class S3Runner(Runner):
15
- connector_config: "SimpleS3Config"
16
-
17
- def update_read_config(self):
18
- self.read_config.download_dir = update_download_dir_remote_url(
19
- connector_name="s3",
20
- read_config=self.read_config,
21
- remote_url=self.connector_config.remote_url, # type: ignore
22
- logger=logger,
23
- )
24
-
25
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
26
- from unstructured_ingest.connector.fsspec.s3 import S3SourceConnector
27
-
28
- return S3SourceConnector
@@ -1,28 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass
3
-
4
- from unstructured_ingest.interfaces import BaseSourceConnector
5
- from unstructured_ingest.logger import logger
6
- from unstructured_ingest.runner.base_runner import Runner
7
- from unstructured_ingest.runner.utils import update_download_dir_remote_url
8
-
9
- if t.TYPE_CHECKING:
10
- from unstructured_ingest.connector.fsspec.sftp import SimpleSftpConfig
11
-
12
-
13
- @dataclass
14
- class SftpRunner(Runner):
15
- connector_config: "SimpleSftpConfig"
16
-
17
- def update_read_config(self):
18
- self.read_config.download_dir = update_download_dir_remote_url(
19
- connector_name="sftp",
20
- read_config=self.read_config,
21
- remote_url=self.connector_config.remote_url, # type: ignore
22
- logger=logger,
23
- )
24
-
25
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
26
- from unstructured_ingest.connector.fsspec.sftp import SftpSourceConnector
27
-
28
- return SftpSourceConnector
@@ -1,37 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.github import SimpleGitHubConfig
12
-
13
-
14
- @dataclass
15
- class GithubRunner(Runner):
16
- connector_config: "SimpleGitHubConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- f"{self.connector_config.url}_{self.connector_config.branch}".encode(
21
- "utf-8",
22
- ),
23
- )
24
-
25
- self.read_config.download_dir = update_download_dir_hash(
26
- connector_name="github",
27
- read_config=self.read_config,
28
- hashed_dir_name=hashed_dir_name,
29
- logger=logger,
30
- )
31
-
32
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
33
- from unstructured_ingest.connector.github import (
34
- GitHubSourceConnector,
35
- )
36
-
37
- return GitHubSourceConnector
@@ -1,37 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.gitlab import SimpleGitlabConfig
12
-
13
-
14
- @dataclass
15
- class GitlabRunner(Runner):
16
- connector_config: "SimpleGitlabConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- f"{self.connector_config.url}_{self.connector_config.branch}".encode(
21
- "utf-8",
22
- ),
23
- )
24
-
25
- self.read_config.download_dir = update_download_dir_hash(
26
- connector_name="gitlab",
27
- read_config=self.read_config,
28
- hashed_dir_name=hashed_dir_name,
29
- logger=logger,
30
- )
31
-
32
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
33
- from unstructured_ingest.connector.gitlab import (
34
- GitLabSourceConnector,
35
- )
36
-
37
- return GitLabSourceConnector
@@ -1,35 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.google_drive import SimpleGoogleDriveConfig
12
-
13
-
14
- @dataclass
15
- class GoogleDriveRunner(Runner):
16
- connector_config: "SimpleGoogleDriveConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- self.connector_config.drive_id.encode("utf-8"),
21
- )
22
-
23
- self.read_config.download_dir = update_download_dir_hash(
24
- connector_name="google_drive",
25
- read_config=self.read_config,
26
- hashed_dir_name=hashed_dir_name,
27
- logger=logger,
28
- )
29
-
30
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
31
- from unstructured_ingest.connector.google_drive import (
32
- GoogleDriveSourceConnector,
33
- )
34
-
35
- return GoogleDriveSourceConnector
@@ -1,35 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.hubspot import SimpleHubSpotConfig
12
-
13
-
14
- @dataclass
15
- class HubSpotRunner(Runner):
16
- connector_config: "SimpleHubSpotConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- self.connector_config.access_config.api_token.encode("utf-8"),
21
- )
22
-
23
- self.read_config.download_dir = update_download_dir_hash(
24
- connector_name="hubspot",
25
- read_config=self.read_config,
26
- hashed_dir_name=hashed_dir_name,
27
- logger=logger,
28
- )
29
-
30
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
31
- from unstructured_ingest.connector.hubspot import (
32
- HubSpotSourceConnector,
33
- )
34
-
35
- return HubSpotSourceConnector
@@ -1,35 +0,0 @@
1
- import hashlib
2
- import typing as t
3
- from dataclasses import dataclass
4
-
5
- from unstructured_ingest.interfaces import BaseSourceConnector
6
- from unstructured_ingest.logger import logger
7
- from unstructured_ingest.runner.base_runner import Runner
8
- from unstructured_ingest.runner.utils import update_download_dir_hash
9
-
10
- if t.TYPE_CHECKING:
11
- from unstructured_ingest.connector.jira import SimpleJiraConfig
12
-
13
-
14
- @dataclass
15
- class JiraRunner(Runner):
16
- connector_config: "SimpleJiraConfig"
17
-
18
- def update_read_config(self):
19
- hashed_dir_name = hashlib.sha256(
20
- self.connector_config.url.encode("utf-8"),
21
- )
22
-
23
- self.read_config.download_dir = update_download_dir_hash(
24
- connector_name="jira",
25
- read_config=self.read_config,
26
- hashed_dir_name=hashed_dir_name,
27
- logger=logger,
28
- )
29
-
30
- def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
31
- from unstructured_ingest.connector.jira import (
32
- JiraSourceConnector,
33
- )
34
-
35
- return JiraSourceConnector