unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568) hide show
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +55 -27
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +8 -8
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/METADATA +98 -97
  279. unstructured_ingest-0.7.1.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/entry_points.txt +0 -0
examples/mongodb.py ADDED
@@ -0,0 +1,53 @@
1
+ import random
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.pipeline.pipeline import Pipeline
7
+ from unstructured_ingest.processes.chunker import ChunkerConfig
8
+ from unstructured_ingest.processes.connectors.local import (
9
+ LocalConnectionConfig,
10
+ LocalDownloaderConfig,
11
+ LocalIndexerConfig,
12
+ )
13
+ from unstructured_ingest.processes.connectors.mongodb import (
14
+ CONNECTOR_TYPE,
15
+ MongoDBAccessConfig,
16
+ MongoDBConnectionConfig,
17
+ MongoDBUploaderConfig,
18
+ MongoDBUploadStagerConfig,
19
+ )
20
+ from unstructured_ingest.processes.embedder import EmbedderConfig
21
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
22
+
23
+ base_path = Path(__file__).parent.parent.parent.parent
24
+ docs_path = base_path / "example-docs"
25
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
26
+ output_path = work_dir / "output"
27
+ download_path = work_dir / "download"
28
+
29
+ if __name__ == "__main__":
30
+ logger.info(f"writing all content in: {work_dir.resolve()}")
31
+ Pipeline.from_configs(
32
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
33
+ indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
34
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
35
+ source_connection_config=LocalConnectionConfig(),
36
+ partitioner_config=PartitionerConfig(strategy="fast"),
37
+ chunker_config=ChunkerConfig(
38
+ chunking_strategy="by_title",
39
+ chunk_include_orig_elements=False,
40
+ chunk_max_characters=1500,
41
+ chunk_multipage_sections=True,
42
+ ),
43
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
44
+ destination_connection_config=MongoDBConnectionConfig(
45
+ access_config=MongoDBAccessConfig(uri=None),
46
+ host="localhost",
47
+ port=27017,
48
+ collection=f"test-collection-{random.randint(1000, 9999)}",
49
+ database="testDatabase",
50
+ ),
51
+ stager_config=MongoDBUploadStagerConfig(),
52
+ uploader_config=MongoDBUploaderConfig(batch_size=10),
53
+ ).run()
examples/opensearch.py ADDED
@@ -0,0 +1,50 @@
1
+ from pathlib import Path
2
+
3
+ from unstructured_ingest.interfaces import ProcessorConfig
4
+ from unstructured_ingest.logger import logger
5
+ from unstructured_ingest.pipeline.pipeline import Pipeline
6
+ from unstructured_ingest.processes.chunker import ChunkerConfig
7
+ from unstructured_ingest.processes.connectors.local import (
8
+ LocalConnectionConfig,
9
+ LocalDownloaderConfig,
10
+ LocalIndexerConfig,
11
+ )
12
+ from unstructured_ingest.processes.connectors.opensearch import (
13
+ CONNECTOR_TYPE,
14
+ OpenSearchAccessConfig,
15
+ OpenSearchConnectionConfig,
16
+ OpenSearchUploaderConfig,
17
+ OpenSearchUploadStagerConfig,
18
+ )
19
+ from unstructured_ingest.processes.embedder import EmbedderConfig
20
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
21
+
22
+ base_path = Path(__file__).parent.parent.parent.parent
23
+ docs_path = base_path / "example-docs"
24
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
25
+ output_path = work_dir / "output"
26
+ download_path = work_dir / "download"
27
+
28
+ if __name__ == "__main__":
29
+ logger.info(f"writing all content in: {work_dir.resolve()}")
30
+ Pipeline.from_configs(
31
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
32
+ indexer_config=LocalIndexerConfig(
33
+ input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
34
+ ),
35
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
36
+ source_connection_config=LocalConnectionConfig(),
37
+ partitioner_config=PartitionerConfig(strategy="fast"),
38
+ chunker_config=ChunkerConfig(chunking_strategy="by_title"),
39
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
40
+ destination_connection_config=OpenSearchConnectionConfig(
41
+ hosts="http://localhost:9247",
42
+ username="admin",
43
+ use_ssl=True,
44
+ access_config=OpenSearchAccessConfig(password="admin"),
45
+ ),
46
+ stager_config=OpenSearchUploadStagerConfig(index_name="ingest-test-destination"),
47
+ uploader_config=OpenSearchUploaderConfig(
48
+ index_name="ingest-test-destination", batch_size_bytes=150
49
+ ),
50
+ ).run()
examples/pinecone.py ADDED
@@ -0,0 +1,57 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.pipeline.pipeline import Pipeline
7
+ from unstructured_ingest.processes.chunker import ChunkerConfig
8
+ from unstructured_ingest.processes.connectors.local import (
9
+ LocalConnectionConfig,
10
+ LocalDownloaderConfig,
11
+ LocalIndexerConfig,
12
+ )
13
+ from unstructured_ingest.processes.connectors.pinecone import (
14
+ CONNECTOR_TYPE,
15
+ PineconeAccessConfig,
16
+ PineconeConnectionConfig,
17
+ PineconeUploaderConfig,
18
+ PineconeUploadStagerConfig,
19
+ )
20
+ from unstructured_ingest.processes.embedder import EmbedderConfig
21
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
22
+
23
+ base_path = Path(__file__).parent.parent.parent.parent
24
+ docs_path = base_path / "example-docs"
25
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
26
+ output_path = work_dir / "output"
27
+ download_path = work_dir / "download"
28
+
29
+ if __name__ == "__main__":
30
+ logger.info(f"writing all content in: {work_dir.resolve()}")
31
+ Pipeline.from_configs(
32
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
33
+ indexer_config=LocalIndexerConfig(
34
+ input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
35
+ ),
36
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
37
+ source_connection_config=LocalConnectionConfig(),
38
+ partitioner_config=PartitionerConfig(strategy="fast"),
39
+ chunker_config=ChunkerConfig(chunking_strategy="by_title"),
40
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
41
+ destination_connection_config=PineconeConnectionConfig(
42
+ # You'll need to set PINECONE_API_KEY environment variable to run this example
43
+ access_config=PineconeAccessConfig(pinecone_api_key=os.getenv("PINECONE_API_KEY")),
44
+ index_name=os.getenv(
45
+ "PINECONE_INDEX",
46
+ default="your index name here. e.g. my-index,"
47
+ "or define in environment variable PINECONE_INDEX",
48
+ ),
49
+ environment=os.getenv(
50
+ "PINECONE_ENVIRONMENT",
51
+ default="your environment name here. e.g. us-east-1,"
52
+ "or define in environment variable PINECONE_ENVIRONMENT",
53
+ ),
54
+ ),
55
+ stager_config=PineconeUploadStagerConfig(),
56
+ uploader_config=PineconeUploaderConfig(batch_size=10, num_processes=2),
57
+ ).run()
examples/s3.py ADDED
@@ -0,0 +1,38 @@
1
+ from pathlib import Path
2
+
3
+ from unstructured_ingest.interfaces import ProcessorConfig
4
+ from unstructured_ingest.logger import logger
5
+ from unstructured_ingest.pipeline.pipeline import Pipeline
6
+ from unstructured_ingest.processes.chunker import ChunkerConfig
7
+ from unstructured_ingest.processes.connectors.fsspec.s3 import (
8
+ CONNECTOR_TYPE,
9
+ S3ConnectionConfig,
10
+ S3DownloaderConfig,
11
+ S3IndexerConfig,
12
+ )
13
+ from unstructured_ingest.processes.connectors.local import (
14
+ LocalUploaderConfig,
15
+ )
16
+ from unstructured_ingest.processes.embedder import EmbedderConfig
17
+ from unstructured_ingest.processes.filter import FiltererConfig
18
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
19
+
20
+ base_path = Path(__file__).parent.parent.parent.parent
21
+ docs_path = base_path / "example-docs"
22
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
23
+ output_path = work_dir / "output"
24
+ download_path = work_dir / "download"
25
+
26
+ if __name__ == "__main__":
27
+ logger.info(f"writing all content in: {work_dir.resolve()}")
28
+ Pipeline.from_configs(
29
+ context=ProcessorConfig(work_dir=str(work_dir.resolve()), verbose=True, iter_delete=True),
30
+ indexer_config=S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/"),
31
+ downloader_config=S3DownloaderConfig(download_dir=download_path),
32
+ source_connection_config=S3ConnectionConfig(anonymous=True),
33
+ partitioner_config=PartitionerConfig(strategy="fast"),
34
+ chunker_config=ChunkerConfig(chunking_strategy="by_title"),
35
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
36
+ uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
37
+ filterer_config=FiltererConfig(max_file_size=900000),
38
+ ).run()
examples/salesforce.py ADDED
@@ -0,0 +1,44 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.pipeline.pipeline import Pipeline
7
+ from unstructured_ingest.processes.chunker import ChunkerConfig
8
+ from unstructured_ingest.processes.connectors.local import (
9
+ LocalUploaderConfig,
10
+ )
11
+ from unstructured_ingest.processes.connectors.salesforce import (
12
+ CONNECTOR_TYPE,
13
+ SalesforceAccessConfig,
14
+ SalesforceConnectionConfig,
15
+ SalesforceDownloaderConfig,
16
+ SalesforceIndexerConfig,
17
+ )
18
+ from unstructured_ingest.processes.embedder import EmbedderConfig
19
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
20
+
21
+ base_path = Path(__file__).parent.parent.parent.parent
22
+ docs_path = base_path / "example-docs"
23
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
24
+ output_path = work_dir / "output"
25
+ download_path = work_dir / "download"
26
+
27
+ if __name__ == "__main__":
28
+ logger.info(f"writing all content in: {work_dir.resolve()}")
29
+ Pipeline.from_configs(
30
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
31
+ indexer_config=SalesforceIndexerConfig(categories=["Campaign", "EmailMessage"]),
32
+ downloader_config=SalesforceDownloaderConfig(download_dir=download_path),
33
+ source_connection_config=SalesforceConnectionConfig(
34
+ SalesforceAccessConfig(
35
+ consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"),
36
+ private_key=os.getenv("SALESFORCE_PRIVATE_KEY"),
37
+ ),
38
+ username=os.getenv("SALESFORCE_USERNAME"),
39
+ ),
40
+ partitioner_config=PartitionerConfig(strategy="fast"),
41
+ chunker_config=ChunkerConfig(chunking_strategy="by_title"),
42
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
43
+ uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
44
+ ).run()
examples/sharepoint.py ADDED
@@ -0,0 +1,47 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.pipeline.pipeline import Pipeline
7
+ from unstructured_ingest.processes.connectors.local import (
8
+ LocalUploaderConfig,
9
+ )
10
+ from unstructured_ingest.processes.connectors.sharepoint import (
11
+ CONNECTOR_TYPE,
12
+ SharepointAccessConfig,
13
+ SharepointConnectionConfig,
14
+ SharepointDownloaderConfig,
15
+ SharepointIndexerConfig,
16
+ SharepointPermissionsConfig,
17
+ )
18
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
19
+
20
+ base_path = Path(__file__).parent.parent.parent.parent
21
+ docs_path = base_path / "example-docs"
22
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
23
+ output_path = work_dir / "output"
24
+ download_path = work_dir / "download"
25
+
26
+
27
+ if __name__ == "__main__":
28
+ logger.info(f"writing all content in: {work_dir.resolve()}")
29
+ Pipeline.from_configs(
30
+ context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True),
31
+ indexer_config=SharepointIndexerConfig(),
32
+ downloader_config=SharepointDownloaderConfig(download_dir=download_path),
33
+ source_connection_config=SharepointConnectionConfig(
34
+ client_id=os.getenv("SHAREPOINT_CLIENT_ID"),
35
+ site=os.getenv("SHAREPOINT_SITE"),
36
+ access_config=SharepointAccessConfig(client_cred=os.getenv("SHAREPOINT_CRED")),
37
+ permissions_config=SharepointPermissionsConfig(
38
+ permissions_application_id=os.getenv("SHAREPOINT_PERMISSIONS_APP_ID"),
39
+ permissions_client_cred=os.getenv("SHAREPOINT_PERMISSIONS_APP_CRED"),
40
+ permissions_tenant=os.getenv("SHAREPOINT_PERMISSIONS_TENANT"),
41
+ ),
42
+ ),
43
+ partitioner_config=PartitionerConfig(strategy="fast"),
44
+ # chunker_config=ChunkerConfig(chunking_strategy="by_title"),
45
+ # embedder_config=EmbedderConfig(embedding_provider="huggingface"),
46
+ uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
47
+ ).run()
examples/singlestore.py ADDED
@@ -0,0 +1,49 @@
1
+ from pathlib import Path
2
+
3
+ from unstructured_ingest.interfaces import ProcessorConfig
4
+ from unstructured_ingest.logger import logger
5
+ from unstructured_ingest.pipeline.pipeline import Pipeline
6
+ from unstructured_ingest.processes.chunker import ChunkerConfig
7
+ from unstructured_ingest.processes.connectors.local import (
8
+ LocalConnectionConfig,
9
+ LocalDownloaderConfig,
10
+ LocalIndexerConfig,
11
+ )
12
+ from unstructured_ingest.processes.connectors.singlestore import (
13
+ CONNECTOR_TYPE,
14
+ SingleStoreAccessConfig,
15
+ SingleStoreConnectionConfig,
16
+ SingleStoreUploaderConfig,
17
+ SingleStoreUploadStagerConfig,
18
+ )
19
+ from unstructured_ingest.processes.embedder import EmbedderConfig
20
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
21
+
22
+ base_path = Path(__file__).parent.parent.parent.parent
23
+ docs_path = base_path / "example-docs"
24
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
25
+ output_path = work_dir / "output"
26
+ download_path = work_dir / "download"
27
+
28
+ if __name__ == "__main__":
29
+ logger.info(f"writing all content in: {work_dir.resolve()}")
30
+ Pipeline.from_configs(
31
+ context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True),
32
+ indexer_config=LocalIndexerConfig(
33
+ input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
34
+ ),
35
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
36
+ source_connection_config=LocalConnectionConfig(),
37
+ partitioner_config=PartitionerConfig(strategy="fast"),
38
+ chunker_config=ChunkerConfig(chunking_strategy="by_title"),
39
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
40
+ destination_connection_config=SingleStoreConnectionConfig(
41
+ access_config=SingleStoreAccessConfig(password="password"),
42
+ host="localhost",
43
+ port=3306,
44
+ database="ingest_test",
45
+ user="root",
46
+ ),
47
+ stager_config=SingleStoreUploadStagerConfig(),
48
+ uploader_config=SingleStoreUploaderConfig(table_name="elements"),
49
+ ).run()
examples/sql.py ADDED
@@ -0,0 +1,90 @@
1
+ import os
2
+ import sqlite3
3
+ from pathlib import Path
4
+
5
+ from unstructured_ingest.interfaces import ProcessorConfig
6
+ from unstructured_ingest.logger import logger
7
+ from unstructured_ingest.pipeline.pipeline import Pipeline
8
+ from unstructured_ingest.processes.chunker import ChunkerConfig
9
+ from unstructured_ingest.processes.connectors.local import (
10
+ LocalConnectionConfig,
11
+ LocalDownloaderConfig,
12
+ LocalIndexerConfig,
13
+ )
14
+ from unstructured_ingest.processes.connectors.sql import (
15
+ CONNECTOR_TYPE,
16
+ POSTGRESQL_DB,
17
+ SQLITE_DB,
18
+ SQLAccessConfig,
19
+ SQLConnectionConfig,
20
+ SQLUploaderConfig,
21
+ SQLUploadStagerConfig,
22
+ )
23
+ from unstructured_ingest.processes.embedder import EmbedderConfig
24
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
25
+
26
+ base_path = Path(__file__).parent.parent.parent.parent
27
+ docs_path = base_path / "example-docs"
28
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
29
+ output_path = work_dir / "output"
30
+ download_path = work_dir / "download"
31
+
32
+ SQLITE_DB_PATH = "test-sql-db.sqlite"
33
+
34
+ if __name__ == "__main__":
35
+ logger.info(f"writing all content in: {work_dir.resolve()}")
36
+
37
+ configs = {
38
+ "context": ProcessorConfig(work_dir=str(work_dir.resolve())),
39
+ "indexer_config": LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
40
+ "downloader_config": LocalDownloaderConfig(download_dir=download_path),
41
+ "source_connection_config": LocalConnectionConfig(),
42
+ "partitioner_config": PartitionerConfig(strategy="fast"),
43
+ "chunker_config": ChunkerConfig(
44
+ chunking_strategy="by_title",
45
+ chunk_include_orig_elements=False,
46
+ chunk_max_characters=1500,
47
+ chunk_multipage_sections=True,
48
+ ),
49
+ "embedder_config": EmbedderConfig(embedding_provider="huggingface"),
50
+ "stager_config": SQLUploadStagerConfig(),
51
+ "uploader_config": SQLUploaderConfig(batch_size=10),
52
+ }
53
+
54
+ if os.path.exists(SQLITE_DB):
55
+ os.remove(SQLITE_DB)
56
+
57
+ connection = sqlite3.connect(database=SQLITE_DB)
58
+
59
+ query = None
60
+ script_path = (
61
+ Path(__file__).parent.parent.parent.parent.parent
62
+ / Path("test_e2e/env_setup/sql/sqlite-schema.sql")
63
+ ).resolve()
64
+ with open(script_path) as f:
65
+ query = f.read()
66
+ cursor = connection.cursor()
67
+ cursor.executescript(query)
68
+ connection.close()
69
+
70
+ # sqlite test first
71
+ Pipeline.from_configs(
72
+ destination_connection_config=SQLConnectionConfig(
73
+ db_type=SQLITE_DB,
74
+ database=SQLITE_DB_PATH,
75
+ access_config=SQLAccessConfig(),
76
+ ),
77
+ **configs,
78
+ ).run()
79
+
80
+ # now, pg with pgvector
81
+ Pipeline.from_configs(
82
+ destination_connection_config=SQLConnectionConfig(
83
+ db_type=POSTGRESQL_DB,
84
+ database="elements",
85
+ host="localhost",
86
+ port=5433,
87
+ access_config=SQLAccessConfig(username="unstructured", password="test"),
88
+ ),
89
+ **configs,
90
+ ).run()
examples/vectara.py ADDED
@@ -0,0 +1,54 @@
1
+ from pathlib import Path
2
+
3
+ from unstructured_ingest.interfaces import ProcessorConfig
4
+ from unstructured_ingest.logger import logger
5
+ from unstructured_ingest.pipeline.pipeline import Pipeline
6
+ from unstructured_ingest.processes.chunker import ChunkerConfig
7
+ from unstructured_ingest.processes.connectors.local import (
8
+ LocalConnectionConfig,
9
+ LocalDownloaderConfig,
10
+ LocalIndexerConfig,
11
+ )
12
+ from unstructured_ingest.processes.connectors.vectara import (
13
+ CONNECTOR_TYPE,
14
+ VectaraAccessConfig,
15
+ VectaraConnectionConfig,
16
+ VectaraUploaderConfig,
17
+ VectaraUploadStagerConfig,
18
+ )
19
+ from unstructured_ingest.processes.embedder import EmbedderConfig
20
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
21
+
22
+ base_path = Path(__file__).parent.parent.parent.parent
23
+ docs_path = base_path / "example-docs"
24
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
25
+ output_path = work_dir / "output"
26
+ download_path = work_dir / "download"
27
+
28
+ if __name__ == "__main__":
29
+ logger.info(f"writing all content in: {work_dir.resolve()}")
30
+ Pipeline.from_configs(
31
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
32
+ indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
33
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
34
+ source_connection_config=LocalConnectionConfig(),
35
+ partitioner_config=PartitionerConfig(strategy="fast"),
36
+ chunker_config=ChunkerConfig(
37
+ chunking_strategy="by_title",
38
+ chunk_include_orig_elements=False,
39
+ chunk_max_characters=1500,
40
+ chunk_multipage_sections=True,
41
+ ),
42
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
43
+ destination_connection_config=VectaraConnectionConfig(
44
+ access_config=VectaraAccessConfig(
45
+ oauth_client_id="fill oauth_client_id", oauth_secret="fill oauth_secret"
46
+ ),
47
+ customer_id="fill customer_id",
48
+ corpus_name="fill corpus_name",
49
+ corpus_key="fill corpus_key",
50
+ token_url="fill token_url",
51
+ ),
52
+ stager_config=VectaraUploadStagerConfig(batch_size=10),
53
+ uploader_config=VectaraUploaderConfig(),
54
+ ).run()
examples/weaviate.py ADDED
@@ -0,0 +1,44 @@
1
+ from pathlib import Path
2
+
3
+ from unstructured_ingest.interfaces import ProcessorConfig
4
+ from unstructured_ingest.logger import logger
5
+ from unstructured_ingest.pipeline.pipeline import Pipeline
6
+ from unstructured_ingest.processes.chunker import ChunkerConfig
7
+ from unstructured_ingest.processes.connectors.local import (
8
+ LocalConnectionConfig,
9
+ LocalDownloaderConfig,
10
+ LocalIndexerConfig,
11
+ )
12
+ from unstructured_ingest.processes.connectors.weaviate.local import (
13
+ CONNECTOR_TYPE,
14
+ LocalWeaviateConnectionConfig,
15
+ LocalWeaviateUploaderConfig,
16
+ LocalWeaviateUploadStagerConfig,
17
+ )
18
+ from unstructured_ingest.processes.embedder import EmbedderConfig
19
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
20
+
21
+ base_path = Path(__file__).parent.parent.parent.parent
22
+ docs_path = base_path / "example-docs"
23
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
24
+ output_path = work_dir / "output"
25
+ download_path = work_dir / "download"
26
+
27
+ if __name__ == "__main__":
28
+ logger.info(f"writing all content in: {work_dir.resolve()}")
29
+ Pipeline.from_configs(
30
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
31
+ indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
32
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
33
+ source_connection_config=LocalConnectionConfig(),
34
+ partitioner_config=PartitionerConfig(strategy="fast"),
35
+ chunker_config=ChunkerConfig(chunking_strategy="by_title"),
36
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
37
+ destination_connection_config=LocalWeaviateConnectionConfig(
38
+ # Connects to http://localhost:8080
39
+ ),
40
+ stager_config=LocalWeaviateUploadStagerConfig(),
41
+ uploader_config=LocalWeaviateUploaderConfig(
42
+ collection="elements", batch_size=10, dynamic_batch=False
43
+ ),
44
+ ).run()
@@ -4,7 +4,7 @@ from pathlib import Path
4
4
  import pytest
5
5
 
6
6
  from test.integration.utils import requires_env
7
- from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
7
+ from unstructured_ingest.processes.chunker import Chunker, ChunkerConfig
8
8
 
9
9
  int_test_dir = Path(__file__).parent
10
10
  assets_dir = int_test_dir / "assets"
@@ -4,7 +4,7 @@ from typing import Generator
4
4
 
5
5
  import pytest
6
6
 
7
- from unstructured_ingest.v2.logger import logger
7
+ from unstructured_ingest.logger import logger
8
8
 
9
9
  FILENAME = Path("DA-1p-with-duplicate-pages.pdf.json")
10
10
 
@@ -20,8 +20,9 @@ from test.integration.connectors.utils.validation.source import (
20
20
  source_connector_validation,
21
21
  )
22
22
  from test.integration.utils import requires_env
23
- from unstructured_ingest.v2.errors import UserAuthError, UserError
24
- from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
23
+ from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
24
+ from unstructured_ingest.errors_v2 import UserAuthError, UserError
25
+ from unstructured_ingest.processes.connectors.databricks.volumes_native import (
25
26
  CONNECTOR_TYPE,
26
27
  DatabricksNativeVolumesAccessConfig,
27
28
  DatabricksNativeVolumesConnectionConfig,
@@ -32,7 +33,6 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes_native impor
32
33
  DatabricksNativeVolumesUploader,
33
34
  DatabricksNativeVolumesUploaderConfig,
34
35
  )
35
- from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
36
36
 
37
37
 
38
38
  @dataclass
@@ -13,7 +13,7 @@ from test.integration.connectors.utils.validation.source import (
13
13
  )
14
14
  from test.integration.utils import requires_env
15
15
  from unstructured_ingest.error import SourceConnectionError
16
- from unstructured_ingest.v2.processes.connectors.discord import (
16
+ from unstructured_ingest.processes.connectors.discord import (
17
17
  CONNECTOR_TYPE,
18
18
  DiscordAccessConfig,
19
19
  DiscordConnectionConfig,
@@ -10,14 +10,14 @@ from test.integration.connectors.utils.validation.destination import (
10
10
  StagerValidationConfigs,
11
11
  stager_validation,
12
12
  )
13
- from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
13
+ from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
14
+ from unstructured_ingest.processes.connectors.duckdb.duckdb import (
14
15
  CONNECTOR_TYPE,
15
16
  DuckDBConnectionConfig,
16
17
  DuckDBUploader,
17
18
  DuckDBUploaderConfig,
18
19
  DuckDBUploadStager,
19
20
  )
20
- from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
21
21
 
22
22
 
23
23
  @pytest.fixture
@@ -9,7 +9,8 @@ import pytest
9
9
 
10
10
  from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
11
11
  from test.integration.utils import requires_env
12
- from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
12
+ from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
13
+ from unstructured_ingest.processes.connectors.duckdb.motherduck import (
13
14
  CONNECTOR_TYPE,
14
15
  MotherDuckAccessConfig,
15
16
  MotherDuckConnectionConfig,
@@ -17,7 +18,6 @@ from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
17
18
  MotherDuckUploaderConfig,
18
19
  MotherDuckUploadStager,
19
20
  )
20
- from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
21
21
 
22
22
 
23
23
  @pytest.fixture
@@ -22,8 +22,8 @@ from test.integration.connectors.utils.validation.source import (
22
22
  source_connector_validation,
23
23
  )
24
24
  from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
25
- from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
26
- from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch import (
25
+ from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
26
+ from unstructured_ingest.processes.connectors.elasticsearch.elasticsearch import (
27
27
  CONNECTOR_TYPE,
28
28
  ElasticsearchAccessConfig,
29
29
  ElasticsearchConnectionConfig,
@@ -20,11 +20,12 @@ from test.integration.connectors.utils.validation.source import (
20
20
  SourceValidationConfigs,
21
21
  source_connector_validation,
22
22
  )
23
+ from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
23
24
  from unstructured_ingest.error import (
24
25
  DestinationConnectionError,
25
26
  SourceConnectionError,
26
27
  )
27
- from unstructured_ingest.v2.processes.connectors.elasticsearch.opensearch import (
28
+ from unstructured_ingest.processes.connectors.elasticsearch.opensearch import (
28
29
  CONNECTOR_TYPE,
29
30
  OpenSearchAccessConfig,
30
31
  OpenSearchConnectionConfig,
@@ -37,7 +38,6 @@ from unstructured_ingest.v2.processes.connectors.elasticsearch.opensearch import
37
38
  OpenSearchUploadStager,
38
39
  OpenSearchUploadStagerConfig,
39
40
  )
40
- from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
41
41
 
42
42
  SOURCE_INDEX_NAME = "movies"
43
43
  DESTINATION_INDEX_NAME = "elements"