unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568) hide show
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +21 -21
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -8,9 +8,7 @@ from unstructured_ingest.embed.interfaces import (
8
8
  BaseEmbeddingEncoder,
9
9
  EmbeddingConfig,
10
10
  )
11
- from unstructured_ingest.logger import logger
12
- from unstructured_ingest.utils.dep_check import requires_dependencies
13
- from unstructured_ingest.v2.errors import (
11
+ from unstructured_ingest.errors_v2 import (
14
12
  ProviderError,
15
13
  QuotaError,
16
14
  RateLimitError,
@@ -18,6 +16,8 @@ from unstructured_ingest.v2.errors import (
18
16
  UserError,
19
17
  is_internal_error,
20
18
  )
19
+ from unstructured_ingest.logger import logger
20
+ from unstructured_ingest.utils.dep_check import requires_dependencies
21
21
 
22
22
  if TYPE_CHECKING:
23
23
  from openai import AsyncOpenAI, OpenAI
@@ -8,9 +8,7 @@ from unstructured_ingest.embed.interfaces import (
8
8
  BaseEmbeddingEncoder,
9
9
  EmbeddingConfig,
10
10
  )
11
- from unstructured_ingest.logger import logger
12
- from unstructured_ingest.utils.dep_check import requires_dependencies
13
- from unstructured_ingest.v2.errors import (
11
+ from unstructured_ingest.errors_v2 import (
14
12
  ProviderError,
15
13
  QuotaError,
16
14
  RateLimitError,
@@ -18,6 +16,8 @@ from unstructured_ingest.v2.errors import (
18
16
  UserError,
19
17
  is_internal_error,
20
18
  )
19
+ from unstructured_ingest.logger import logger
20
+ from unstructured_ingest.utils.dep_check import requires_dependencies
21
21
 
22
22
  if TYPE_CHECKING:
23
23
  from openai import AsyncOpenAI, OpenAI
@@ -8,12 +8,12 @@ from unstructured_ingest.embed.interfaces import (
8
8
  BaseEmbeddingEncoder,
9
9
  EmbeddingConfig,
10
10
  )
11
- from unstructured_ingest.logger import logger
12
- from unstructured_ingest.utils.dep_check import requires_dependencies
13
- from unstructured_ingest.v2.errors import (
11
+ from unstructured_ingest.errors_v2 import (
14
12
  RateLimitError as CustomRateLimitError,
15
13
  )
16
- from unstructured_ingest.v2.errors import UserAuthError, UserError, is_internal_error
14
+ from unstructured_ingest.errors_v2 import UserAuthError, UserError, is_internal_error
15
+ from unstructured_ingest.logger import logger
16
+ from unstructured_ingest.utils.dep_check import requires_dependencies
17
17
 
18
18
  if TYPE_CHECKING:
19
19
  from together import AsyncTogether, Together
@@ -13,8 +13,8 @@ from unstructured_ingest.embed.interfaces import (
13
13
  BaseEmbeddingEncoder,
14
14
  EmbeddingConfig,
15
15
  )
16
+ from unstructured_ingest.errors_v2 import UserAuthError, is_internal_error
16
17
  from unstructured_ingest.utils.dep_check import requires_dependencies
17
- from unstructured_ingest.v2.errors import UserAuthError, is_internal_error
18
18
 
19
19
  if TYPE_CHECKING:
20
20
  from vertexai.language_models import TextEmbeddingModel
@@ -8,12 +8,12 @@ from unstructured_ingest.embed.interfaces import (
8
8
  BaseEmbeddingEncoder,
9
9
  EmbeddingConfig,
10
10
  )
11
- from unstructured_ingest.logger import logger
12
- from unstructured_ingest.utils.dep_check import requires_dependencies
13
- from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError, is_internal_error
14
- from unstructured_ingest.v2.errors import (
11
+ from unstructured_ingest.errors_v2 import ProviderError, UserAuthError, UserError, is_internal_error
12
+ from unstructured_ingest.errors_v2 import (
15
13
  RateLimitError as CustomRateLimitError,
16
14
  )
15
+ from unstructured_ingest.logger import logger
16
+ from unstructured_ingest.utils.dep_check import requires_dependencies
17
17
 
18
18
  if TYPE_CHECKING:
19
19
  from voyageai import AsyncClient as AsyncVoyageAIClient
@@ -5,9 +5,9 @@ from typing import Any, Optional, TypedDict, TypeVar, Union
5
5
 
6
6
  from pydantic import BaseModel, Field
7
7
 
8
- from unstructured_ingest.v2.interfaces.connector import BaseConnector
9
- from unstructured_ingest.v2.interfaces.process import BaseProcess
10
- from unstructured_ingest.v2.types.file_data import FileData
8
+ from unstructured_ingest.data_types.file_data import FileData
9
+ from unstructured_ingest.interfaces.connector import BaseConnector
10
+ from unstructured_ingest.interfaces.process import BaseProcess
11
11
 
12
12
 
13
13
  class DownloaderConfig(BaseModel):
@@ -3,9 +3,9 @@ from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
3
3
 
4
4
  from pydantic import BaseModel
5
5
 
6
- from unstructured_ingest.v2.interfaces.connector import BaseConnector
7
- from unstructured_ingest.v2.interfaces.process import BaseProcess
8
- from unstructured_ingest.v2.types.file_data import FileData
6
+ from unstructured_ingest.data_types.file_data import FileData
7
+ from unstructured_ingest.interfaces.connector import BaseConnector
8
+ from unstructured_ingest.interfaces.process import BaseProcess
9
9
 
10
10
 
11
11
  class IndexerConfig(BaseModel):
@@ -5,10 +5,10 @@ from typing import Any, TypeVar
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
8
+ from unstructured_ingest.data_types.file_data import FileData
9
+ from unstructured_ingest.interfaces import BaseProcess
8
10
  from unstructured_ingest.utils import ndjson
9
11
  from unstructured_ingest.utils.data_prep import get_data, write_data
10
- from unstructured_ingest.v2.interfaces import BaseProcess
11
- from unstructured_ingest.v2.types.file_data import FileData
12
12
 
13
13
 
14
14
  class UploadStagerConfig(BaseModel):
@@ -5,9 +5,9 @@ from typing import Any, TypeVar
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
8
+ from unstructured_ingest.data_types.file_data import FileData
9
+ from unstructured_ingest.interfaces import BaseConnector, BaseProcess
8
10
  from unstructured_ingest.utils.data_prep import get_data
9
- from unstructured_ingest.v2.interfaces import BaseConnector, BaseProcess
10
- from unstructured_ingest.v2.types.file_data import FileData
11
11
 
12
12
 
13
13
  class UploaderConfig(BaseModel):
@@ -13,7 +13,7 @@ from opentelemetry.sdk.trace.export import (
13
13
  SpanExportResult,
14
14
  )
15
15
 
16
- from unstructured_ingest.v2.logger import logger
16
+ from unstructured_ingest.logger import logger
17
17
 
18
18
 
19
19
  class AddTraceCallable(Protocol):
@@ -1,22 +0,0 @@
1
- from .doc_factory import DocFactory
2
- from .interfaces import PipelineContext, ReformatNode
3
- from .partition import Partitioner
4
- from .permissions import PermissionsDataCleaner
5
- from .pipeline import Pipeline
6
- from .reformat.chunking import Chunker
7
- from .reformat.embedding import Embedder
8
- from .source import Reader
9
- from .write import Writer
10
-
11
- __all__ = [
12
- "DocFactory",
13
- "Partitioner",
14
- "Reader",
15
- "Embedder",
16
- "PipelineContext",
17
- "Pipeline",
18
- "Writer",
19
- "Chunker",
20
- "ReformatNode",
21
- "PermissionsDataCleaner",
22
- ]
@@ -1,270 +1,211 @@
1
- import hashlib
2
- import json
1
+ from __future__ import annotations
2
+
3
+ import asyncio
3
4
  import logging
4
5
  import multiprocessing as mp
5
- import typing as t
6
+ import shutil
6
7
  from abc import ABC, abstractmethod
7
- from dataclasses import dataclass, field
8
- from multiprocessing.managers import DictProxy
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from dataclasses import dataclass
9
10
  from pathlib import Path
11
+ from typing import Any, Awaitable, Callable, Optional, TypeVar
10
12
 
11
- from dataclasses_json import DataClassJsonMixin
13
+ from tqdm import tqdm
14
+ from tqdm.asyncio import tqdm as tqdm_asyncio
12
15
 
13
- from unstructured_ingest.error import SourceConnectionNetworkError
14
- from unstructured_ingest.interfaces import (
15
- BaseDestinationConnector,
16
- BaseSourceConnector,
17
- PartitionConfig,
18
- ProcessorConfig,
19
- ReadConfig,
20
- RetryStrategyConfig,
21
- )
22
- from unstructured_ingest.logger import ingest_log_streaming_init, logger
16
+ from unstructured_ingest.interfaces import BaseProcess, ProcessorConfig, Uploader
17
+ from unstructured_ingest.logger import logger, make_default_logger
18
+ from unstructured_ingest.otel import OtelHandler
19
+ from unstructured_ingest.pipeline.otel import instrument
23
20
 
24
- if t.TYPE_CHECKING:
25
- from unstructured_ingest.ingest_backoff import RetryHandler
21
+ BaseProcessT = TypeVar("BaseProcessT", bound=BaseProcess)
22
+ iterable_input = list[dict[str, Any]]
26
23
 
27
24
 
28
25
  @dataclass
29
- class PipelineContext(ProcessorConfig):
30
- """
31
- Data that gets shared across each pipeline node
32
- """
33
-
34
- def __post_init__(self):
35
- self._ingest_docs_map: t.Optional[DictProxy] = None
26
+ class PipelineStep(ABC):
27
+ process: BaseProcessT
28
+ context: ProcessorConfig
29
+ identifier: str
36
30
 
37
- @property
38
- def ingest_docs_map(self) -> DictProxy:
39
- if self._ingest_docs_map is None:
40
- raise ValueError("ingest_docs_map never initialized")
41
- return self._ingest_docs_map
31
+ def __str__(self):
32
+ return self.identifier
42
33
 
43
- @ingest_docs_map.setter
44
- def ingest_docs_map(self, value: DictProxy):
45
- self._ingest_docs_map = value
34
+ def process_serially(self, iterable: iterable_input) -> Any:
35
+ logger.info("processing content serially")
36
+ if iterable:
37
+ if len(iterable) == 1:
38
+ return [self.run(**iterable[0])]
39
+ if self.context.tqdm:
40
+ return [self.run(**it) for it in tqdm(iterable, desc=self.identifier)]
41
+ return [self.run(**it) for it in iterable]
42
+ return [self.run()]
43
+
44
+ async def _process_async(self, iterable: iterable_input) -> Any:
45
+ if iterable:
46
+ if len(iterable) == 1:
47
+ return [await self.run_async(**iterable[0])]
48
+ if self.context.tqdm:
49
+ return await tqdm_asyncio.gather(
50
+ *[self.run_async(**i) for i in iterable], desc=self.identifier
51
+ )
52
+ return await asyncio.gather(*[self.run_async(**i) for i in iterable])
53
+ return [await self.run_async()]
54
+
55
+ def process_async(self, iterable: iterable_input) -> Any:
56
+ logger.info("processing content async")
57
+ return self.asyncio_run(fn=self._process_async, iterable=iterable)
58
+
59
+ def asyncio_run(
60
+ self, fn: Callable[[Any, Any], Awaitable[Any]], *args: Any, **kwargs: Any
61
+ ) -> Any:
62
+ current_loop = asyncio._get_running_loop()
63
+ if current_loop is None:
64
+ return asyncio.run(fn(*args, **kwargs))
65
+ with ThreadPoolExecutor(thread_name_prefix="asyncio") as thread_pool:
66
+ logger.warning(
67
+ f"async code being run in dedicated thread pool "
68
+ f"to not conflict with existing event loop: {current_loop}"
69
+ )
46
70
 
71
+ def wrapped():
72
+ return asyncio.run(fn(*args, **kwargs))
47
73
 
48
- @dataclass
49
- class PipelineNode(DataClassJsonMixin, ABC):
50
- """
51
- Class that encapsulates logic to run during a single pipeline step
52
- """
74
+ future = thread_pool.submit(wrapped)
75
+ return future.result()
53
76
 
54
- pipeline_context: PipelineContext
77
+ def process_multiprocess(self, iterable: iterable_input) -> Any:
78
+ logger.info("processing content across processes")
55
79
 
56
- def __call__(self, iterable: t.Optional[t.Iterable[t.Any]] = None) -> t.Any:
57
- iterable = iterable if iterable else []
80
+ if iterable:
81
+ if len(iterable) == 1:
82
+ return self.process_serially(iterable)
83
+ if self.context.num_processes == 1:
84
+ return self.process_serially(iterable)
85
+ with mp.Pool(
86
+ processes=self.context.num_processes,
87
+ initializer=self._init_mp,
88
+ initargs=(
89
+ logging.DEBUG if self.context.verbose else logging.INFO,
90
+ self.context.otel_endpoint,
91
+ ),
92
+ ) as pool:
93
+ otel_context = OtelHandler.inject_context()
94
+ for iter in iterable:
95
+ iter[OtelHandler.trace_context_key] = otel_context
96
+ if self.context.tqdm:
97
+ return list(
98
+ tqdm(
99
+ pool.imap_unordered(func=self._wrap_mp, iterable=iterable),
100
+ total=len(iterable),
101
+ desc=self.identifier,
102
+ )
103
+ )
104
+ return pool.map(self._wrap_mp, iterable)
105
+ return [self.run()]
106
+
107
+ def _wrap_mp(self, input_kwargs: dict) -> Any:
108
+ # Allow mapping of kwargs via multiprocessing map()
109
+ return self.run(**input_kwargs)
110
+
111
+ def _init_mp(self, log_level: int, endpoint: Optional[str] = None) -> None:
112
+ # Init logger for each spawned process when using multiprocessing pool
113
+ make_default_logger(level=log_level)
114
+ otel_handler = OtelHandler(otel_endpoint=endpoint, log_out=logger.debug)
115
+ otel_handler.init_trace()
116
+
117
+ @instrument()
118
+ def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
119
+ iterable = iterable or []
58
120
  if iterable:
59
121
  logger.info(
60
122
  f"calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
61
123
  )
62
-
63
- self.initialize()
64
- if not self.supported_multiprocessing():
65
- if iterable:
66
- self.result = self.run(iterable)
67
- else:
68
- self.result = self.run()
69
- elif self.pipeline_context.num_processes == 1:
70
- if iterable:
71
- self.result = [self.run(it) for it in iterable]
72
- else:
73
- self.result = self.run()
74
124
  else:
75
- with mp.Pool(
76
- processes=self.pipeline_context.num_processes,
77
- initializer=ingest_log_streaming_init,
78
- initargs=(logging.DEBUG if self.pipeline_context.verbose else logging.INFO,),
79
- ) as pool:
80
- self.result = pool.map(self.run, iterable)
81
- # Remove None which may be caused by failed docs that didn't raise an error
82
- if isinstance(self.result, t.Iterable):
83
- self.result = [r for r in self.result if r is not None]
84
- return self.result
85
-
86
- def supported_multiprocessing(self) -> bool:
87
- return True
88
-
89
- @abstractmethod
90
- def run(self, *args, **kwargs) -> t.Optional[t.Any]:
91
- pass
92
-
93
- def initialize(self):
94
- if path := self.get_path():
95
- logger.info(f"creating {path}")
96
- path.mkdir(parents=True, exist_ok=True)
97
- ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)
98
-
99
- def get_path(self) -> t.Optional[Path]:
100
- return None
101
-
102
-
103
- @dataclass
104
- class DocFactoryNode(PipelineNode):
105
- """
106
- Encapsulated logic to generate a list of ingest docs
107
- """
108
-
109
- source_doc_connector: BaseSourceConnector
110
-
111
- def initialize(self):
112
- logger.info(
113
- f"Running doc factory to generate ingest docs. "
114
- f"Source connector: {self.source_doc_connector.to_json()}",
115
- )
116
- super().initialize()
117
- self.source_doc_connector.initialize()
118
-
119
- @abstractmethod
120
- def run(self, *args, **kwargs) -> t.Iterable[dict]:
121
- pass
122
-
123
- def supported_multiprocessing(self) -> bool:
124
- return False
125
-
126
-
127
- @dataclass
128
- class SourceNode(PipelineNode):
129
- """A pipeline node representing logic to pull data from a source using base ingest documents.
130
-
131
- This class encapsulates the logic for pulling data from a specified source using base ingest
132
- documents. The output of this logic is expected to be in JSON format representing the data
133
- itself.
134
-
135
- Attributes:
136
- read_config: A configuration object specifying how to read data from the source.
137
- retry_strategy_config: Optional configuration specifying the strategy for network errors.
138
-
139
- Properties:
140
- retry_strategy: A retry handler configured based on the retry strategy configuration.
141
-
142
- Methods:
143
- initialize: Initializes the source node and logs the process.
144
- run: Abstract method for downloading data associated with ingest documents.
145
- """
146
-
147
- read_config: ReadConfig
148
- retry_strategy_config: t.Optional[RetryStrategyConfig] = None
125
+ logger.info(f"calling {self.__class__.__name__} with no inputs")
126
+ if self.context.async_supported and self.process.is_async():
127
+ return self.process_async(iterable=iterable)
128
+ if self.context.mp_supported:
129
+ return self.process_multiprocess(iterable=iterable)
130
+ return self.process_serially(iterable=iterable)
131
+
132
+ def _run(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
133
+ return self.asyncio_run(fn=self.run_async, _fn=fn, **kwargs)
134
+
135
+ async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
136
+ raise NotImplementedError
137
+
138
+ def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]:
139
+ kwargs = kwargs.copy()
140
+ otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
141
+ tracer = otel_handler.get_tracer()
142
+ if trace_context := kwargs.pop(otel_handler.trace_context_key, {}):
143
+ otel_handler.attach_context(trace_context=trace_context)
144
+ attributes = {}
145
+ if file_data_path := kwargs.get("file_data_path"):
146
+ attributes["file_id"] = Path(file_data_path).stem
147
+ try:
148
+ with tracer.start_as_current_span(self.identifier, record_exception=True) as span:
149
+ otel_handler.set_attributes(span, attributes)
150
+ fn = _fn or self.process.run
151
+ return self._run(fn=fn, **kwargs)
152
+ except Exception as e:
153
+ logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
154
+ if "file_data_path" in kwargs:
155
+ self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
156
+ if self.context.raise_on_error:
157
+ raise e
158
+ return None
159
+
160
+ async def run_async(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
161
+ otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
162
+ try:
163
+ attributes = {}
164
+ if file_data_path := kwargs.get("file_data_path"):
165
+ attributes["file_id"] = Path(file_data_path).stem
166
+ with otel_handler.get_tracer().start_as_current_span(
167
+ self.identifier, record_exception=True
168
+ ) as span:
169
+ otel_handler.set_attributes(span, attributes)
170
+ fn = _fn or self.process.run_async
171
+ return await self._run_async(fn=fn, **kwargs)
172
+ except Exception as e:
173
+ logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
174
+ if "file_data_path" in kwargs:
175
+ self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
176
+ if self.context.raise_on_error:
177
+ raise e
178
+ return None
149
179
 
150
180
  @property
151
- def retry_strategy(self) -> t.Optional["RetryHandler"]:
152
- if retry_strategy_config := self.retry_strategy_config:
153
- import backoff
181
+ def cache_dir(self) -> Path:
182
+ return Path(self.context.work_dir) / self.identifier
154
183
 
155
- from unstructured_ingest.ingest_backoff import RetryHandler
156
-
157
- return RetryHandler(
158
- backoff.expo,
159
- SourceConnectionNetworkError,
160
- max_time=retry_strategy_config.max_retry_time,
161
- max_tries=retry_strategy_config.max_retries,
162
- logger=logger,
163
- start_log_level=logger.level,
164
- backoff_log_level=logger.level,
165
- )
166
- return None
167
-
168
- def initialize(self):
169
- logger.info("Running source node to download data associated with ingest docs")
170
- super().initialize()
171
-
172
- @abstractmethod
173
- def run(self, ingest_doc_json: str) -> t.Optional[str]:
174
- pass
175
-
176
-
177
- @dataclass
178
- class PartitionNode(PipelineNode):
179
- """
180
- Encapsulates logic to run partition on the json files as the output of the source node
181
- """
182
-
183
- partition_config: PartitionConfig
184
- partition_kwargs: dict = field(default_factory=dict)
185
-
186
- def initialize(self):
187
- logger.info(
188
- f"Running partition node to extract content from json files. "
189
- f"Config: {self.partition_config.to_json()}, "
190
- f"partition kwargs: {json.dumps(self.partition_kwargs)}]",
191
- )
192
- super().initialize()
193
-
194
- def create_hash(self) -> str:
195
- hash_dict = self.partition_config.to_dict()
196
- hash_dict["partition_kwargs"] = self.partition_kwargs
197
- return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]
198
-
199
- @abstractmethod
200
- def run(self, json_path: str) -> t.Optional[str]:
201
- pass
202
-
203
- def get_path(self) -> Path:
204
- return (Path(self.pipeline_context.work_dir) / "partitioned").resolve()
205
-
206
-
207
- @dataclass
208
- class ReformatNode(PipelineNode, ABC):
209
- """
210
- Encapsulated any logic to reformat the output List[Element]
211
- content from partition before writing it
212
- """
213
-
214
- @abstractmethod
215
- def run(self, elements_json: str) -> t.Optional[str]:
216
- pass
217
-
218
-
219
- @dataclass
220
- class WriteNode(PipelineNode):
221
- """
222
- Encapsulated logic to write the final result to a downstream data connection
223
- """
224
-
225
- dest_doc_connector: BaseDestinationConnector
226
-
227
- @abstractmethod
228
- def run(self, json_paths: t.List[str]):
229
- pass
230
-
231
- def initialize(self):
232
- logger.info(
233
- f"Running write node to upload content. "
234
- f"Destination connector: {self.dest_doc_connector.to_json(redact_sensitive=True)}]",
235
- )
236
- super().initialize()
237
- self.dest_doc_connector.initialize()
238
-
239
- def supported_multiprocessing(self) -> bool:
240
- return False
184
+ def delete_cache(self):
185
+ if self.context.iter_delete and self.cache_dir.exists():
186
+ cache_dir = self.cache_dir
187
+ logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
188
+ shutil.rmtree(cache_dir)
241
189
 
242
190
 
243
191
  @dataclass
244
- class CopyNode(PipelineNode):
245
- """
246
- Encapsulated logic to copy the final result of the pipeline to the designated output location.
247
- """
192
+ class BatchPipelineStep(PipelineStep, ABC):
193
+ process: Uploader
248
194
 
249
- def initialize(self):
250
- logger.info("Running copy node to move content to desired output location")
251
- super().initialize()
195
+ def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
196
+ if self.context.mp_supported and self.process.is_batch():
197
+ return self.run_batch(contents=iterable)
198
+ super().__call__(iterable=iterable)
252
199
 
253
200
  @abstractmethod
254
- def run(self, json_path: str):
201
+ def _run_batch(self, contents: iterable_input, **kwargs) -> Any:
255
202
  pass
256
203
 
257
-
258
- @dataclass
259
- class PermissionsNode(PipelineNode):
260
- """
261
- Encapsulated logic to do operations on permissions related data.
262
- """
263
-
264
- def initialize(self):
265
- logger.info("Running permissions node to cleanup the permissions folder")
266
- super().initialize()
267
-
268
- @abstractmethod
269
- def run(self):
270
- pass
204
+ def run_batch(self, contents: iterable_input, **kwargs) -> Any:
205
+ try:
206
+ return self._run_batch(contents=contents, **kwargs)
207
+ except Exception as e:
208
+ self.context.status[self.identifier] = {"step_error": str(e)}
209
+ if self.context.raise_on_error:
210
+ raise e
211
+ return None
@@ -1,8 +1,8 @@
1
1
  from functools import wraps
2
2
  from typing import Callable, Optional
3
3
 
4
- from unstructured_ingest.v2.logger import logger
5
- from unstructured_ingest.v2.otel import OtelHandler
4
+ from unstructured_ingest.logger import logger
5
+ from unstructured_ingest.otel import OtelHandler
6
6
 
7
7
 
8
8
  def instrument(