unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +21 -21
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,117 +1,408 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
1
4
  import logging
2
5
  import multiprocessing as mp
3
- from dataclasses import dataclass, field
4
- from typing import Any, Optional
5
-
6
- from dataclasses_json import DataClassJsonMixin
7
-
8
- from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
9
- from unstructured_ingest.interfaces import BaseIngestDocBatch, BaseSingleIngestDoc
10
- from unstructured_ingest.logger import ingest_log_streaming_init, logger
11
- from unstructured_ingest.pipeline.copy import Copier
12
- from unstructured_ingest.pipeline.interfaces import (
13
- DocFactoryNode,
14
- PartitionNode,
15
- PipelineContext,
16
- ReformatNode,
17
- SourceNode,
18
- WriteNode,
6
+ import shutil
7
+ from dataclasses import InitVar, dataclass, field
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from unstructured_ingest.interfaces import ProcessorConfig, Uploader
12
+ from unstructured_ingest.logger import logger, make_default_logger
13
+ from unstructured_ingest.otel import OtelHandler
14
+ from unstructured_ingest.pipeline.interfaces import PipelineStep
15
+ from unstructured_ingest.pipeline.steps.chunk import Chunker, ChunkStep
16
+ from unstructured_ingest.pipeline.steps.download import DownloaderT, DownloadStep
17
+ from unstructured_ingest.pipeline.steps.embed import Embedder, EmbedStep
18
+ from unstructured_ingest.pipeline.steps.filter import Filterer, FilterStep
19
+ from unstructured_ingest.pipeline.steps.index import IndexerT, IndexStep
20
+ from unstructured_ingest.pipeline.steps.partition import Partitioner, PartitionStep
21
+ from unstructured_ingest.pipeline.steps.stage import UploadStager, UploadStageStep
22
+ from unstructured_ingest.pipeline.steps.uncompress import Uncompressor, UncompressStep
23
+ from unstructured_ingest.pipeline.steps.upload import UploadStep
24
+ from unstructured_ingest.processes.chunker import ChunkerConfig
25
+ from unstructured_ingest.processes.connector_registry import (
26
+ ConnectionConfig,
27
+ DownloaderConfigT,
28
+ IndexerConfigT,
29
+ UploaderConfigT,
30
+ UploadStagerConfigT,
31
+ destination_registry,
32
+ source_registry,
19
33
  )
20
- from unstructured_ingest.pipeline.permissions import PermissionsDataCleaner
21
- from unstructured_ingest.pipeline.utils import get_ingest_doc_hash
34
+ from unstructured_ingest.processes.connectors.local import LocalUploader
35
+ from unstructured_ingest.processes.embedder import EmbedderConfig
36
+ from unstructured_ingest.processes.filter import FiltererConfig
37
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
38
+
39
+
40
+ class PipelineError(Exception):
41
+ pass
22
42
 
23
43
 
24
44
  @dataclass
25
- class Pipeline(DataClassJsonMixin):
26
- pipeline_context: PipelineContext
27
- doc_factory_node: DocFactoryNode
28
- source_node: SourceNode
29
- partition_node: Optional[PartitionNode] = None
30
- write_node: Optional[WriteNode] = None
31
- reformat_nodes: "list[ReformatNode]" = field(default_factory=list)
32
- permissions_node: Optional[PermissionsDataCleaner] = None
33
-
34
- def initialize(self):
35
- ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)
36
-
37
- def get_nodes_str(self):
38
- nodes = [self.doc_factory_node, self.source_node, self.partition_node]
39
- nodes.extend(self.reformat_nodes)
40
- if self.write_node:
41
- nodes.append(self.write_node)
42
- nodes.append(Copier(pipeline_context=self.pipeline_context))
43
- return " -> ".join([node.__class__.__name__ for node in nodes])
44
-
45
- def expand_batch_docs(self, dict_docs: "list[dict[str, Any]]") -> "list[dict[str, Any]]":
46
- expanded_docs: list[dict[str, Any]] = []
47
- for d in dict_docs:
48
- doc = create_ingest_doc_from_dict(d)
49
- if isinstance(doc, BaseSingleIngestDoc):
50
- expanded_docs.append(doc.to_dict())
51
- elif isinstance(doc, BaseIngestDocBatch):
52
- expanded_docs.extend([single_doc.to_dict() for single_doc in doc.ingest_docs])
53
- else:
54
- raise ValueError(
55
- f"type of doc ({type(doc)}) is not a recognized type: "
56
- f"BaseSingleIngestDoc or BaseSingleIngestDoc"
45
+ class Pipeline:
46
+ context: ProcessorConfig
47
+
48
+ indexer: InitVar[IndexerT]
49
+ indexer_step: IndexStep = field(init=False)
50
+
51
+ downloader: InitVar[DownloaderT]
52
+ downloader_step: DownloadStep = field(init=False)
53
+
54
+ partitioner: InitVar[Partitioner]
55
+ partitioner_step: PartitionStep = field(init=False)
56
+
57
+ chunker: InitVar[Chunker | None] = None
58
+ chunker_step: ChunkStep | None = field(init=False, default=None)
59
+
60
+ embedder: InitVar[Embedder | None] = None
61
+ embedder_step: EmbedStep | None = field(init=False, default=None)
62
+
63
+ stager: InitVar[UploadStager | None] = None
64
+ stager_step: UploadStageStep | None = field(init=False, default=None)
65
+
66
+ uploader: InitVar[Uploader] = field(default=LocalUploader())
67
+ uploader_step: UploadStep | None = field(init=False, default=None)
68
+
69
+ uncompress_step: UncompressStep | None = field(init=False, default=None)
70
+
71
+ filterer: InitVar[Filterer | None] = None
72
+ filter_step: FilterStep | None = field(init=False, default=None)
73
+
74
+ def __post_init__(
75
+ self,
76
+ indexer: IndexerT,
77
+ downloader: DownloaderT,
78
+ partitioner: Partitioner,
79
+ chunker: Chunker | None = None,
80
+ embedder: Embedder | None = None,
81
+ stager: UploadStager | None = None,
82
+ uploader: Uploader | None = None,
83
+ filterer: Filterer | None = None,
84
+ ):
85
+ make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
86
+ otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint)
87
+ otel_handler.init_trace()
88
+ self.indexer_step = IndexStep(process=indexer, context=self.context)
89
+ self.downloader_step = DownloadStep(process=downloader, context=self.context)
90
+ self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
91
+ self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
92
+ self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None
93
+
94
+ self.embedder_step = EmbedStep(process=embedder, context=self.context) if embedder else None
95
+
96
+ self.stager_step = UploadStageStep(process=stager, context=self.context) if stager else None
97
+ self.uploader_step = UploadStep(process=uploader, context=self.context)
98
+ if self.context.uncompress:
99
+ process = Uncompressor()
100
+ self.uncompress_step = UncompressStep(process=process, context=self.context)
101
+
102
+ self.check_destination_connector()
103
+
104
+ def check_destination_connector(self):
105
+ # Make sure that if the set destination connector expects a stager, one is also set
106
+ if not self.uploader_step:
107
+ return
108
+ uploader_connector_type = self.uploader_step.process.connector_type
109
+ registry_entry = destination_registry[uploader_connector_type]
110
+ if registry_entry.upload_stager and self.stager_step is None:
111
+ try:
112
+ self.stager_step = UploadStageStep(
113
+ process=registry_entry.upload_stager(), context=self.context
57
114
  )
58
- return expanded_docs
115
+ return
116
+ except Exception as e:
117
+ logger.debug(f"failed to instantiate required stager on user's behalf: {e}")
118
+ raise ValueError(
119
+ f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
120
+ f"expects a stager of type {registry_entry.upload_stager.__name__} "
121
+ f"but one was not set"
122
+ )
123
+
124
+ def cleanup(self):
125
+ if self.context.delete_cache and Path(self.context.work_dir).exists():
126
+ logger.info(f"deleting cache directory: {self.context.work_dir}")
127
+ shutil.rmtree(self.context.work_dir)
128
+
129
+ def log_statuses(self):
130
+ if status := self.context.status:
131
+ logger.error(f"{len(status)} failed documents:")
132
+ for k, v in status.items():
133
+ for kk, vv in v.items():
134
+ logger.error(f"{k}: [{kk}] {vv}")
135
+
136
+ def _run_initialization(self):
137
+ failures = {}
138
+ init_kwargs = {}
139
+ for step in self._get_ordered_steps():
140
+ try:
141
+ step.process.init(**init_kwargs)
142
+ step.process.precheck()
143
+ # Make sure embedder dimensions available for downstream steps
144
+ if isinstance(step.process, Embedder):
145
+ embed_dimensions = step.process.config.get_embedder().dimension
146
+ init_kwargs["vector_length"] = embed_dimensions
147
+
148
+ except Exception as e:
149
+ failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
150
+ if failures:
151
+ for k, v in failures.items():
152
+ logger.error(f"Step initialization failure: {k}: {v}")
153
+ raise PipelineError("Initialization failed")
59
154
 
60
155
  def run(self):
156
+ otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.info)
157
+ try:
158
+ with otel_handler.get_tracer().start_as_current_span(
159
+ "ingest process", record_exception=True
160
+ ):
161
+ self._run_initialization()
162
+ self._run()
163
+ finally:
164
+ self.log_statuses()
165
+ self.cleanup()
166
+ if self.context.status:
167
+ raise PipelineError("Pipeline did not run successfully")
168
+
169
def clean_results(self, results: list[Any | list[Any]] | None) -> list[Any] | None:
    """Flatten one level of nesting in step results and drop falsy entries.

    Args:
        results: Raw step output. Each entry is either a single result or a
            list of results; entries (and nested items) may be falsy, e.g. None.

    Returns:
        A flat list of the truthy results in their original order, or None
        when ``results`` is empty/None or nothing truthy remains.
    """
    if not results:
        return None
    # A bare result is treated as a one-item list so both shapes share a path;
    # empty lists contribute nothing and falsy items are filtered in the same pass.
    flat = [
        item
        for entry in results
        for item in (entry if isinstance(entry, list) else [entry])
        if item
    ]
    return flat or None
181
+
182
+ def _get_ordered_steps(self) -> list[PipelineStep]:
183
+ steps = [self.indexer_step, self.downloader_step]
184
+ if self.uncompress_step:
185
+ steps.append(self.uncompress_step)
186
+ steps.append(self.partitioner_step)
187
+ if self.chunker_step:
188
+ steps.append(self.chunker_step)
189
+ if self.embedder_step:
190
+ steps.append(self.embedder_step)
191
+ if self.stager_step:
192
+ steps.append(self.stager_step)
193
+ steps.append(self.uploader_step)
194
+ return steps
195
+
196
def apply_filter(self, records: list[dict]) -> list[dict]:
    """Return the records whose ``file_data_path`` survives the filter step.

    When no filter step is configured the input list is returned unchanged.
    Otherwise the filter step is called with a list of
    ``{"file_data_path": ...}`` dicts (one per record); entries it returns as
    None are treated as filtered out.

    Args:
        records: Dicts that each carry a ``"file_data_path"`` key.

    Returns:
        The subset of ``records`` that passed the filter, in original order.
    """
    if not self.filter_step:
        return records
    filter_inputs = [{"file_data_path": record["file_data_path"]} for record in records]
    # A set gives O(1) membership checks below instead of rescanning a list for
    # every record. Assumes file_data_path values are hashable (presumably path
    # strings) -- confirm against the indexer/downloader output.
    kept_paths = {
        kept["file_data_path"] for kept in self.filter_step(filter_inputs) if kept is not None
    }
    return [record for record in records if record["file_data_path"] in kept_paths]
205
+
206
+ def get_indices(self) -> list[dict]:
207
+ if self.indexer_step.process.is_async():
208
+
209
+ async def run_async():
210
+ output = []
211
+ async for i in self.indexer_step.run_async():
212
+ output.append(i)
213
+ return output
214
+
215
+ indices = asyncio.run(run_async())
216
+ else:
217
+ indices = self.indexer_step.run()
218
+ indices_inputs = [{"file_data_path": i} for i in indices]
219
+ return indices_inputs
220
+
221
+ def _run(self):
61
222
  logger.info(
62
- f"running pipeline: {self.get_nodes_str()} "
63
- f"with config: {self.pipeline_context.to_json()}",
223
+ f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
64
224
  )
65
- self.initialize()
66
- manager = mp.Manager()
67
- self.pipeline_context.ingest_docs_map = manager.dict()
68
- # -- Get the documents to be processed --
69
- dict_docs = self.doc_factory_node()
70
- dict_docs = [manager.dict(d) for d in dict_docs]
71
- if not dict_docs:
72
- logger.info("no docs found to process")
225
+ if self.context.mp_supported:
226
+ manager = mp.Manager()
227
+ self.context.status = manager.dict()
228
+ else:
229
+ self.context.status = {}
230
+
231
+ # Index into data source
232
+ indices_inputs = self.get_indices()
233
+ if not indices_inputs:
234
+ logger.info("No files to process after indexer, exiting")
73
235
  return
74
- logger.info(
75
- f"processing {len(dict_docs)} docs via "
76
- f"{self.pipeline_context.num_processes} processes",
77
- )
78
- for doc in dict_docs:
79
- self.pipeline_context.ingest_docs_map[get_ingest_doc_hash(doc)] = doc
80
- fetched_filenames = self.source_node(iterable=dict_docs)
81
- if self.source_node.read_config.download_only:
82
- logger.info("stopping pipeline after downloading files")
236
+
237
+ # Initial filtering on indexed content
238
+ indices_inputs = self.apply_filter(records=indices_inputs)
239
+ if not indices_inputs:
240
+ logger.info("No files to process after filtering indexed content, exiting")
241
+ return
242
+
243
+ # Download associated content to local file system
244
+ downloaded_data = self.downloader_step(indices_inputs)
245
+ downloaded_data = self.clean_results(results=downloaded_data)
246
+ if not downloaded_data:
247
+ logger.info("No files to process after downloader, exiting")
248
+ return
249
+
250
+ # Post download filtering
251
+ downloaded_data = self.apply_filter(records=downloaded_data)
252
+ if not downloaded_data:
253
+ logger.info("No files to process after filtering downloaded content, exiting")
83
254
  return
84
- if not fetched_filenames:
85
- logger.info("No files to run partition over")
255
+
256
+ # Run uncompress if available
257
+ if self.uncompress_step:
258
+ downloaded_data = self.uncompress_step(downloaded_data)
259
+ # Flatten list of lists
260
+ downloaded_data = self.clean_results(results=downloaded_data)
261
+
262
+ # Post uncompress filtering
263
+ downloaded_data = self.apply_filter(records=downloaded_data)
264
+ if not downloaded_data:
265
+ logger.info("No files to process after filtering uncompressed content, exiting")
266
+ return
267
+
268
+ if not downloaded_data or self.context.download_only:
86
269
  return
87
- # -- To support batches ingest docs, expand those into the populated single ingest
88
- # -- docs after downloading content
89
- dict_docs = self.expand_batch_docs(dict_docs=dict_docs)
90
- if self.partition_node is None:
91
- raise ValueError("partition node not set")
92
- partitioned_jsons = self.partition_node(iterable=dict_docs)
93
- if not partitioned_jsons:
94
- logger.info("No files to process after partitioning")
270
+
271
+ # Partition content
272
+ elements = self.partitioner_step(downloaded_data)
273
+ elements = self.clean_results(results=elements)
274
+ # Downloaded data no longer needed, delete if possible
275
+ self.downloader_step.delete_cache()
276
+ elements = self.clean_results(results=elements)
277
+ if not elements:
278
+ logger.info("No files to process after partitioning, exiting")
95
279
  return
96
- for reformat_node in self.reformat_nodes:
97
- reformatted_jsons = reformat_node(iterable=partitioned_jsons)
98
- if not reformatted_jsons:
99
- logger.info(f"no files to process after {reformat_node.__class__.__name__}")
280
+
281
+ # Run element specific modifiers
282
+ last_step = self.partitioner_step
283
+ for step in [s for s in [self.chunker_step, self.embedder_step, self.stager_step] if s]:
284
+ elements = step(elements)
285
+ elements = self.clean_results(results=elements)
286
+ # Delete data from previous step if possible since no longer needed
287
+ last_step.delete_cache()
288
+ last_step = step
289
+ if not elements:
290
+ logger.info(f"no files to process after {step.__class__.__name__}, exiting")
100
291
  return
101
- partitioned_jsons = reformatted_jsons
102
292
 
103
- # -- Copy the final destination to the desired location --
104
- copier = Copier(
105
- pipeline_context=self.pipeline_context,
106
- )
107
- copier(iterable=partitioned_jsons)
293
+ # Upload the final result
294
+ self.uploader_step(iterable=elements)
295
+ last_step.delete_cache()
108
296
 
109
- if self.write_node:
110
- logger.info(
111
- f"uploading elements from {len(partitioned_jsons)} "
112
- "document(s) to the destination"
297
def __str__(self):
    """Render the pipeline as its step names joined by ``" -> "``.

    Steps appear in execution order. The filter step, when configured, is
    listed after the indexer, after the downloader and after the uncompress
    step, which are the points where ``_run`` applies it. Optional steps that
    are not set are omitted entirely.
    """
    # Empty when no filter is configured, so nothing is emitted for it. The
    # previous code appended the text "None" after the uncompress step.
    filter_label = [str(self.filter_step)] if self.filter_step else []

    parts = [str(self.indexer_step), *filter_label, str(self.downloader_step), *filter_label]
    if self.uncompress_step:
        parts.append(str(self.uncompress_step))
        parts.extend(filter_label)
    parts.append(str(self.partitioner_step))
    parts.extend(
        str(step) for step in (self.chunker_step, self.embedder_step, self.stager_step) if step
    )
    parts.append(str(self.uploader_step))
    return " -> ".join(parts)
315
+
316
+ @classmethod
317
+ def from_configs(
318
+ cls,
319
+ context: ProcessorConfig,
320
+ indexer_config: IndexerConfigT,
321
+ downloader_config: DownloaderConfigT,
322
+ source_connection_config: ConnectionConfig,
323
+ partitioner_config: PartitionerConfig,
324
+ filterer_config: FiltererConfig | None = None,
325
+ chunker_config: ChunkerConfig | None = None,
326
+ embedder_config: EmbedderConfig | None = None,
327
+ destination_connection_config: ConnectionConfig | None = None,
328
+ stager_config: UploadStagerConfigT | None = None,
329
+ uploader_config: UploaderConfigT | None = None,
330
+ ) -> "Pipeline":
331
+ # Get registry key based on indexer config
332
+ source_entry = {
333
+ k: v
334
+ for k, v in source_registry.items()
335
+ if type(indexer_config) is v.indexer_config
336
+ and type(downloader_config) is v.downloader_config
337
+ and type(source_connection_config) is v.connection_config
338
+ }
339
+ if len(source_entry) > 1:
340
+ raise ValueError(
341
+ f"multiple entries found matching provided indexer, "
342
+ f"downloader and connection configs: {source_entry}"
343
+ )
344
+ if len(source_entry) != 1:
345
+ raise ValueError(
346
+ "no entry found in source registry with matching indexer, "
347
+ "downloader and connection configs"
113
348
  )
114
- self.write_node(iterable=partitioned_jsons)
349
+ source = list(source_entry.values())[0]
350
+ pipeline_kwargs = {
351
+ "context": context,
352
+ "indexer": source.indexer(
353
+ index_config=indexer_config, connection_config=source_connection_config
354
+ ),
355
+ "downloader": source.downloader(
356
+ download_config=downloader_config, connection_config=source_connection_config
357
+ ),
358
+ "partitioner": Partitioner(config=partitioner_config),
359
+ }
360
+ if filterer_config:
361
+ pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
362
+ if chunker_config:
363
+ pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
364
+ if embedder_config:
365
+ pipeline_kwargs["embedder"] = Embedder(config=embedder_config)
366
+ if not uploader_config:
367
+ return Pipeline(**pipeline_kwargs)
115
368
 
116
- if self.permissions_node:
117
- self.permissions_node.cleanup_permissions()
369
+ destination_entry = {
370
+ k: v
371
+ for k, v in destination_registry.items()
372
+ if isinstance(uploader_config, v.uploader_config)
373
+ }
374
+ if destination_connection_config:
375
+ destination_entry = {
376
+ k: v
377
+ for k, v in destination_entry.items()
378
+ if isinstance(destination_connection_config, v.connection_config)
379
+ }
380
+ if stager_config:
381
+ destination_entry = {
382
+ k: v
383
+ for k, v in destination_entry.items()
384
+ if isinstance(stager_config, v.upload_stager_config)
385
+ }
386
+
387
+ if len(destination_entry) > 1:
388
+ raise ValueError(
389
+ f"multiple entries found matching provided uploader, "
390
+ f"stager and connection configs: {destination_entry}"
391
+ )
392
+ if len(destination_entry) != 1:
393
+ raise ValueError(
394
+ "no entry found in destination registry with matching uploader, "
395
+ "stager and connection configs"
396
+ )
397
+
398
+ destination = list(destination_entry.values())[0]
399
+ if stager_config:
400
+ pipeline_kwargs["stager"] = destination.upload_stager(
401
+ upload_stager_config=stager_config
402
+ )
403
+ if uploader_config:
404
+ uploader_kwargs = {"upload_config": uploader_config}
405
+ if destination_connection_config:
406
+ uploader_kwargs["connection_config"] = destination_connection_config
407
+ pipeline_kwargs["uploader"] = destination.uploader(**uploader_kwargs)
408
+ return cls(**pipeline_kwargs)
@@ -4,12 +4,12 @@ from dataclasses import dataclass
4
4
  from pathlib import Path
5
5
  from typing import Callable, Optional, TypedDict
6
6
 
7
+ from unstructured_ingest.data_types.file_data import FileData, file_data_from_file
8
+ from unstructured_ingest.logger import logger
9
+ from unstructured_ingest.pipeline.interfaces import PipelineStep
10
+ from unstructured_ingest.processes.chunker import Chunker
7
11
  from unstructured_ingest.utils.data_prep import write_data
8
- from unstructured_ingest.v2.logger import logger
9
- from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
10
- from unstructured_ingest.v2.processes.chunker import Chunker
11
- from unstructured_ingest.v2.types.file_data import FileData, file_data_from_file
12
- from unstructured_ingest.v2.utils import serialize_base_model_json
12
+ from unstructured_ingest.utils.pydantic_models import serialize_base_model_json
13
13
 
14
14
  STEP_ID = "chunk"
15
15
 
@@ -6,11 +6,11 @@ from dataclasses import dataclass
6
6
  from pathlib import Path
7
7
  from typing import Callable, Optional, TypedDict, TypeVar
8
8
 
9
- from unstructured_ingest.v2.interfaces import Downloader, download_responses
10
- from unstructured_ingest.v2.logger import logger
11
- from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
12
- from unstructured_ingest.v2.types.file_data import FileData, file_data_from_file
13
- from unstructured_ingest.v2.utils import serialize_base_model_json
9
+ from unstructured_ingest.data_types.file_data import FileData, file_data_from_file
10
+ from unstructured_ingest.interfaces import Downloader, download_responses
11
+ from unstructured_ingest.logger import logger
12
+ from unstructured_ingest.pipeline.interfaces import PipelineStep
13
+ from unstructured_ingest.utils.pydantic_models import serialize_base_model_json
14
14
 
15
15
  DownloaderT = TypeVar("DownloaderT", bound=Downloader)
16
16
 
@@ -4,12 +4,12 @@ from dataclasses import dataclass
4
4
  from pathlib import Path
5
5
  from typing import Callable, Optional, TypedDict
6
6
 
7
+ from unstructured_ingest.data_types.file_data import FileData, file_data_from_file
8
+ from unstructured_ingest.logger import logger
9
+ from unstructured_ingest.pipeline.interfaces import PipelineStep
10
+ from unstructured_ingest.processes.embedder import Embedder
7
11
  from unstructured_ingest.utils.data_prep import write_data
8
- from unstructured_ingest.v2.logger import logger
9
- from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
10
- from unstructured_ingest.v2.processes.embedder import Embedder
11
- from unstructured_ingest.v2.types.file_data import FileData, file_data_from_file
12
- from unstructured_ingest.v2.utils import serialize_base_model_json
12
+ from unstructured_ingest.utils.pydantic_models import serialize_base_model_json
13
13
 
14
14
  STEP_ID = "embed"
15
15
 
@@ -2,10 +2,10 @@ import asyncio
2
2
  from dataclasses import dataclass
3
3
  from typing import Callable, Optional
4
4
 
5
- from unstructured_ingest.v2.logger import logger
6
- from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
7
- from unstructured_ingest.v2.processes.filter import Filterer
8
- from unstructured_ingest.v2.types.file_data import file_data_from_file
5
+ from unstructured_ingest.data_types.file_data import file_data_from_file
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.pipeline.interfaces import PipelineStep
8
+ from unstructured_ingest.processes.filter import Filterer
9
9
 
10
10
  STEP_ID = "filter"
11
11
 
@@ -3,11 +3,11 @@ import json
3
3
  from dataclasses import dataclass
4
4
  from typing import AsyncGenerator, Generator, Optional, TypeVar
5
5
 
6
- from unstructured_ingest.v2.interfaces.indexer import Indexer
7
- from unstructured_ingest.v2.logger import logger
8
- from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
9
- from unstructured_ingest.v2.pipeline.otel import instrument
10
- from unstructured_ingest.v2.utils import serialize_base_model_json
6
+ from unstructured_ingest.interfaces.indexer import Indexer
7
+ from unstructured_ingest.logger import logger
8
+ from unstructured_ingest.pipeline.interfaces import PipelineStep
9
+ from unstructured_ingest.pipeline.otel import instrument
10
+ from unstructured_ingest.utils.pydantic_models import serialize_base_model_json
11
11
 
12
12
  IndexerT = TypeVar("IndexerT", bound=Indexer)
13
13
 
@@ -4,12 +4,12 @@ from dataclasses import dataclass
4
4
  from pathlib import Path
5
5
  from typing import Callable, Optional, TypedDict
6
6
 
7
+ from unstructured_ingest.data_types.file_data import FileData, file_data_from_file
8
+ from unstructured_ingest.logger import logger
9
+ from unstructured_ingest.pipeline.interfaces import PipelineStep
10
+ from unstructured_ingest.processes.partitioner import Partitioner
7
11
  from unstructured_ingest.utils.data_prep import write_data
8
- from unstructured_ingest.v2.logger import logger
9
- from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
10
- from unstructured_ingest.v2.processes.partitioner import Partitioner
11
- from unstructured_ingest.v2.types.file_data import FileData, file_data_from_file
12
- from unstructured_ingest.v2.utils import serialize_base_model_json
12
+ from unstructured_ingest.utils.pydantic_models import serialize_base_model_json
13
13
 
14
14
  STEP_ID = "partition"
15
15
 
@@ -4,11 +4,11 @@ from dataclasses import dataclass
4
4
  from pathlib import Path
5
5
  from typing import Callable, Optional, TypedDict
6
6
 
7
- from unstructured_ingest.v2.interfaces import UploadStager
8
- from unstructured_ingest.v2.logger import logger
9
- from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
10
- from unstructured_ingest.v2.types.file_data import file_data_from_file
11
- from unstructured_ingest.v2.utils import serialize_base_model_json
7
+ from unstructured_ingest.data_types.file_data import file_data_from_file
8
+ from unstructured_ingest.interfaces import UploadStager
9
+ from unstructured_ingest.logger import logger
10
+ from unstructured_ingest.pipeline.interfaces import PipelineStep
11
+ from unstructured_ingest.utils.pydantic_models import serialize_base_model_json
12
12
 
13
13
  STEP_ID = "upload_stage"
14
14
 
@@ -3,10 +3,10 @@ from dataclasses import dataclass
3
3
  from pathlib import Path
4
4
  from typing import Callable, TypedDict
5
5
 
6
- from unstructured_ingest.v2.logger import logger
7
- from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
8
- from unstructured_ingest.v2.processes.uncompress import Uncompressor
9
- from unstructured_ingest.v2.types.file_data import file_data_from_file
6
+ from unstructured_ingest.data_types.file_data import file_data_from_file
7
+ from unstructured_ingest.logger import logger
8
+ from unstructured_ingest.pipeline.interfaces import PipelineStep
9
+ from unstructured_ingest.processes.uncompress import Uncompressor
10
10
 
11
11
  STEP_ID = "uncompress"
12
12