unstructured-ingest 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +49 -0
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/processes/connectors/github.py +221 -0
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
examples/airtable.py ADDED
@@ -0,0 +1,44 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.pipeline.pipeline import Pipeline
7
+ from unstructured_ingest.processes.chunker import ChunkerConfig
8
+ from unstructured_ingest.processes.connectors.airtable import (
9
+ CONNECTOR_TYPE,
10
+ AirtableAccessConfig,
11
+ AirtableConnectionConfig,
12
+ AirtableDownloaderConfig,
13
+ AirtableIndexerConfig,
14
+ )
15
+ from unstructured_ingest.processes.connectors.local import (
16
+ LocalUploaderConfig,
17
+ )
18
+ from unstructured_ingest.processes.embedder import EmbedderConfig
19
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
20
+
21
+ base_path = Path(__file__).parent.parent.parent.parent
22
+ docs_path = base_path / "example-docs"
23
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
24
+ output_path = work_dir / "output"
25
+ download_path = work_dir / "download"
26
+
27
+ if __name__ == "__main__":
28
+ logger.info(f"writing all content in: {work_dir.resolve()}")
29
+ Pipeline.from_configs(
30
+ context=ProcessorConfig(work_dir=str(work_dir.resolve()), verbose=True),
31
+ indexer_config=AirtableIndexerConfig(
32
+ list_of_paths=["app5YQxSfp220fWtm", "appJ43QmP8I17zu88"]
33
+ ),
34
+ downloader_config=AirtableDownloaderConfig(download_dir=download_path),
35
+ source_connection_config=AirtableConnectionConfig(
36
+ access_config=AirtableAccessConfig(
37
+ personal_access_token=os.getenv("AIRTABLE_PERSONAL_ACCESS_TOKEN")
38
+ )
39
+ ),
40
+ partitioner_config=PartitionerConfig(strategy="fast"),
41
+ chunker_config=ChunkerConfig(chunking_strategy="by_title"),
42
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
43
+ uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
44
+ ).run()
examples/azure_cognitive_search.py ADDED
@@ -0,0 +1,55 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.pipeline.pipeline import Pipeline
7
+ from unstructured_ingest.processes.chunker import ChunkerConfig
8
+ from unstructured_ingest.processes.connectors.azure_ai_search import (
9
+ CONNECTOR_TYPE,
10
+ AzureAISearchAccessConfig,
11
+ AzureAISearchConnectionConfig,
12
+ AzureAISearchUploaderConfig,
13
+ AzureAISearchUploadStagerConfig,
14
+ )
15
+ from unstructured_ingest.processes.connectors.local import (
16
+ LocalConnectionConfig,
17
+ LocalDownloaderConfig,
18
+ LocalIndexerConfig,
19
+ )
20
+ from unstructured_ingest.processes.embedder import EmbedderConfig
21
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
22
+
23
+ base_path = Path(__file__).parent.parent.parent.parent
24
+ docs_path = base_path / "example-docs"
25
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
26
+ output_path = work_dir / "output"
27
+ download_path = work_dir / "download"
28
+
29
+ if __name__ == "__main__":
30
+ logger.info(f"writing all content in: {work_dir.resolve()}")
31
+ index_name = "ingest-test-destination"
32
+ Pipeline.from_configs(
33
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
34
+ indexer_config=LocalIndexerConfig(
35
+ input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
36
+ ),
37
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
38
+ source_connection_config=LocalConnectionConfig(),
39
+ partitioner_config=PartitionerConfig(strategy="fast"),
40
+ chunker_config=ChunkerConfig(
41
+ chunking_strategy="by_title", chunk_include_orig_elements=False
42
+ ),
43
+ embedder_config=EmbedderConfig(
44
+ embedding_provider="openai", embedding_api_key=os.getenv("OPENAI_API_KEY")
45
+ ),
46
+ destination_connection_config=AzureAISearchConnectionConfig(
47
+ access_config=AzureAISearchAccessConfig(
48
+ azure_ai_search_key=os.getenv("AZURE_SEARCH_API_KEY")
49
+ ),
50
+ index=os.getenv("AZURE_SEARCH_INDEX"),
51
+ endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
52
+ ),
53
+ uploader_config=AzureAISearchUploaderConfig(batch_size=10),
54
+ stager_config=AzureAISearchUploadStagerConfig(),
55
+ ).run()
examples/chroma.py ADDED
@@ -0,0 +1,54 @@
1
+ import random
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.pipeline.pipeline import Pipeline
7
+ from unstructured_ingest.processes.chunker import ChunkerConfig
8
+ from unstructured_ingest.processes.connectors.chroma import (
9
+ CONNECTOR_TYPE,
10
+ ChromaAccessConfig,
11
+ ChromaConnectionConfig,
12
+ ChromaUploaderConfig,
13
+ ChromaUploadStagerConfig,
14
+ )
15
+ from unstructured_ingest.processes.connectors.local import (
16
+ LocalConnectionConfig,
17
+ LocalDownloaderConfig,
18
+ LocalIndexerConfig,
19
+ )
20
+ from unstructured_ingest.processes.embedder import EmbedderConfig
21
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
22
+
23
+ base_path = Path(__file__).parent.parent.parent.parent
24
+ docs_path = base_path / "example-docs"
25
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
26
+ output_path = work_dir / "output"
27
+ download_path = work_dir / "download"
28
+
29
+ if __name__ == "__main__":
30
+ logger.info(f"writing all content in: {work_dir.resolve()}")
31
+ Pipeline.from_configs(
32
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
33
+ indexer_config=LocalIndexerConfig(input_path=docs_path.resolve() / "multisimple"),
34
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
35
+ source_connection_config=LocalConnectionConfig(),
36
+ partitioner_config=PartitionerConfig(strategy="fast"),
37
+ chunker_config=ChunkerConfig(
38
+ chunking_strategy="by_title",
39
+ chunk_include_orig_elements=False,
40
+ chunk_max_characters=1500,
41
+ chunk_multipage_sections=True,
42
+ ),
43
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
44
+ destination_connection_config=ChromaConnectionConfig(
45
+ access_config=ChromaAccessConfig(settings=None, headers=None),
46
+ host="localhost",
47
+ port=8047,
48
+ collection_name=f"test-collection-{random.randint(1000, 9999)}",
49
+ tenant="default_tenant",
50
+ database="default_database",
51
+ ),
52
+ stager_config=ChromaUploadStagerConfig(),
53
+ uploader_config=ChromaUploaderConfig(batch_size=10),
54
+ ).run()
examples/couchbase.py ADDED
@@ -0,0 +1,55 @@
1
+ from pathlib import Path
2
+
3
+ from unstructured_ingest.interfaces import ProcessorConfig
4
+ from unstructured_ingest.logger import logger
5
+ from unstructured_ingest.pipeline.pipeline import Pipeline
6
+ from unstructured_ingest.processes.chunker import ChunkerConfig
7
+ from unstructured_ingest.processes.connectors.couchbase import (
8
+ CONNECTOR_TYPE,
9
+ CouchbaseAccessConfig,
10
+ CouchbaseConnectionConfig,
11
+ CouchbaseUploaderConfig,
12
+ CouchbaseUploadStagerConfig,
13
+ )
14
+ from unstructured_ingest.processes.connectors.local import (
15
+ LocalConnectionConfig,
16
+ LocalDownloaderConfig,
17
+ LocalIndexerConfig,
18
+ )
19
+ from unstructured_ingest.processes.embedder import EmbedderConfig
20
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
21
+
22
+ base_path = Path(__file__).parent.parent.parent.parent
23
+ docs_path = base_path / "example-docs"
24
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
25
+ output_path = work_dir / "output"
26
+ download_path = work_dir / "download"
27
+
28
+ if __name__ == "__main__":
29
+ logger.info(f"writing all content in: {work_dir.resolve()}")
30
+ Pipeline.from_configs(
31
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
32
+ indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
33
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
34
+ source_connection_config=LocalConnectionConfig(),
35
+ partitioner_config=PartitionerConfig(strategy="fast"),
36
+ chunker_config=ChunkerConfig(
37
+ chunking_strategy="by_title",
38
+ chunk_include_orig_elements=False,
39
+ chunk_max_characters=1500,
40
+ chunk_multipage_sections=True,
41
+ ),
42
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
43
+ destination_connection_config=CouchbaseConnectionConfig(
44
+ access_config=CouchbaseAccessConfig(
45
+ connection_string="couchbase://localhost",
46
+ username="Administrator",
47
+ password="password",
48
+ ),
49
+ bucket="example_bucket",
50
+ scope="example_scope",
51
+ collection="example_collection",
52
+ ),
53
+ stager_config=CouchbaseUploadStagerConfig(),
54
+ uploader_config=CouchbaseUploaderConfig(batch_size=10),
55
+ ).run()
examples/databricks_volumes_dest.py ADDED
@@ -0,0 +1,55 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.pipeline.pipeline import Pipeline
7
+ from unstructured_ingest.processes.chunker import ChunkerConfig
8
+ from unstructured_ingest.processes.connectors.databricks.volumes_native import (
9
+ CONNECTOR_TYPE,
10
+ DatabricksNativeVolumesAccessConfig,
11
+ DatabricksNativeVolumesConnectionConfig,
12
+ DatabricksNativeVolumesUploaderConfig,
13
+ )
14
+ from unstructured_ingest.processes.connectors.local import (
15
+ LocalConnectionConfig,
16
+ LocalDownloaderConfig,
17
+ LocalIndexerConfig,
18
+ )
19
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
20
+
21
+ base_path = Path(__file__).parent.parent.parent.parent
22
+ docs_path = base_path / "example-docs"
23
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
24
+ output_path = work_dir / "output"
25
+ download_path = work_dir / "download"
26
+
27
+ if __name__ == "__main__":
28
+ logger.info(f"writing all content in: {work_dir.resolve()}")
29
+ Pipeline.from_configs(
30
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
31
+ indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/fake-text.txt"),
32
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
33
+ source_connection_config=LocalConnectionConfig(),
34
+ partitioner_config=PartitionerConfig(strategy="fast"),
35
+ chunker_config=ChunkerConfig(
36
+ chunking_strategy="basic",
37
+ ),
38
+ embedder_config=None,
39
+ destination_connection_config=DatabricksNativeVolumesConnectionConfig(
40
+ access_config=DatabricksNativeVolumesAccessConfig(
41
+ client_id=os.environ["DATABRICKS_CLIENT_ID"],
42
+ client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
43
+ ),
44
+ host=os.environ["DATABRICKS_HOST"],
45
+ catalog=os.environ["DATABRICKS_CATALOG"],
46
+ volume=os.environ["DATABRICKS_VOLUME"],
47
+ volume_path=os.environ["DATABRICKS_VOLUME_PATH"],
48
+ ),
49
+ uploader_config=DatabricksNativeVolumesUploaderConfig(
50
+ overwrite=True,
51
+ catalog=os.environ["DATABRICKS_CATALOG"],
52
+ volume=os.environ["DATABRICKS_VOLUME"],
53
+ volume_path=os.environ["DATABRICKS_VOLUME_PATH"],
54
+ ),
55
+ ).run()
examples/databricks_volumes_source.py ADDED
@@ -0,0 +1,53 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.pipeline.pipeline import Pipeline
7
+ from unstructured_ingest.processes.chunker import ChunkerConfig
8
+ from unstructured_ingest.processes.connectors.databricks.volumes_native import (
9
+ CONNECTOR_TYPE,
10
+ DatabricksNativeVolumesAccessConfig,
11
+ DatabricksNativeVolumesConnectionConfig,
12
+ DatabricksNativeVolumesDownloaderConfig,
13
+ DatabricksNativeVolumesIndexerConfig,
14
+ )
15
+ from unstructured_ingest.processes.connectors.local import (
16
+ LocalUploaderConfig,
17
+ )
18
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
19
+
20
+ base_path = Path(__file__).parent.parent.parent.parent
21
+ docs_path = base_path / "example-docs"
22
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
23
+ output_path = work_dir / "output"
24
+ download_path = work_dir / "download"
25
+
26
+ if __name__ == "__main__":
27
+ logger.info(f"writing all content in: {work_dir.resolve()}")
28
+ Pipeline.from_configs(
29
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
30
+ indexer_config=DatabricksNativeVolumesIndexerConfig(
31
+ host=os.environ["DATABRICKS_HOST"],
32
+ catalog=os.environ["DATABRICKS_CATALOG"],
33
+ volume=os.environ["DATABRICKS_VOLUME"],
34
+ volume_path=os.environ["DATABRICKS_VOLUME_PATH"],
35
+ ),
36
+ downloader_config=DatabricksNativeVolumesDownloaderConfig(download_dir=download_path),
37
+ source_connection_config=DatabricksNativeVolumesConnectionConfig(
38
+ access_config=DatabricksNativeVolumesAccessConfig(
39
+ client_id=os.environ["DATABRICKS_CLIENT_ID"],
40
+ client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
41
+ ),
42
+ host=os.environ["DATABRICKS_HOST"],
43
+ catalog=os.environ["DATABRICKS_CATALOG"],
44
+ volume=os.environ["DATABRICKS_VOLUME"],
45
+ volume_path=os.environ["DATABRICKS_VOLUME_PATH"],
46
+ ),
47
+ partitioner_config=PartitionerConfig(strategy="fast"),
48
+ chunker_config=ChunkerConfig(
49
+ chunking_strategy="basic",
50
+ ),
51
+ embedder_config=None,
52
+ uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
53
+ ).run()
examples/delta_table.py ADDED
@@ -0,0 +1,45 @@
1
+ from pathlib import Path
2
+
3
+ from unstructured_ingest.interfaces import ProcessorConfig
4
+ from unstructured_ingest.logger import logger
5
+ from unstructured_ingest.pipeline.pipeline import Pipeline
6
+ from unstructured_ingest.processes.chunker import ChunkerConfig
7
+ from unstructured_ingest.processes.connectors.delta_table import (
8
+ CONNECTOR_TYPE,
9
+ DeltaTableAccessConfig,
10
+ DeltaTableConnectionConfig,
11
+ DeltaTableUploaderConfig,
12
+ DeltaTableUploadStagerConfig,
13
+ )
14
+ from unstructured_ingest.processes.connectors.local import (
15
+ LocalConnectionConfig,
16
+ LocalDownloaderConfig,
17
+ LocalIndexerConfig,
18
+ )
19
+ from unstructured_ingest.processes.embedder import EmbedderConfig
20
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
21
+
22
+ base_path = Path(__file__).parent.parent.parent.parent
23
+ docs_path = base_path / "example-docs"
24
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
25
+ output_path = work_dir / "output"
26
+ download_path = work_dir / "download"
27
+
28
+ if __name__ == "__main__":
29
+ logger.info(f"writing all content in: {work_dir.resolve()}")
30
+ Pipeline.from_configs(
31
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
32
+ indexer_config=LocalIndexerConfig(
33
+ input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt",
34
+ ),
35
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
36
+ source_connection_config=LocalConnectionConfig(),
37
+ partitioner_config=PartitionerConfig(strategy="fast"),
38
+ chunker_config=ChunkerConfig(chunking_strategy="by_title"),
39
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
40
+ destination_connection_config=DeltaTableConnectionConfig(
41
+ access_config=DeltaTableAccessConfig(), table_uri="example_uri"
42
+ ),
43
+ stager_config=DeltaTableUploadStagerConfig(),
44
+ uploader_config=DeltaTableUploaderConfig(),
45
+ ).run()
examples/discord_example.py ADDED
@@ -0,0 +1,36 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.pipeline.pipeline import Pipeline
7
+ from unstructured_ingest.processes.connectors.discord import (
8
+ CONNECTOR_TYPE,
9
+ DiscordAccessConfig,
10
+ DiscordConnectionConfig,
11
+ DiscordDownloaderConfig,
12
+ DiscordIndexerConfig,
13
+ )
14
+ from unstructured_ingest.processes.connectors.local import LocalUploaderConfig
15
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
16
+
17
+ base_path = Path(__file__).parent.parent.parent.parent
18
+ docs_path = base_path / "example-docs"
19
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
20
+ output_path = work_dir / "output"
21
+ download_path = work_dir / "download"
22
+
23
+ if __name__ == "__main__":
24
+ logger.info(f"writing all content in: {work_dir.resolve()}")
25
+ Pipeline.from_configs(
26
+ context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True),
27
+ indexer_config=DiscordIndexerConfig(channels=os.environ["DISCORD_CHANNELS"].split(",")),
28
+ downloader_config=DiscordDownloaderConfig(limit=int(os.getenv("DISCORD_LIMIT", 100))),
29
+ source_connection_config=DiscordConnectionConfig(
30
+ access_config=DiscordAccessConfig(token=os.environ["DISCORD_TOKEN"])
31
+ ),
32
+ partitioner_config=PartitionerConfig(strategy="fast"),
33
+ # chunker_config=ChunkerConfig(chunking_strategy="by_title"),
34
+ # embedder_config=EmbedderConfig(embedding_provider="huggingface"),
35
+ uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
36
+ ).run()
examples/elasticsearch.py ADDED
@@ -0,0 +1,49 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.pipeline.pipeline import Pipeline
7
+ from unstructured_ingest.processes.chunker import ChunkerConfig
8
+ from unstructured_ingest.processes.connectors.elasticsearch import (
9
+ CONNECTOR_TYPE,
10
+ ElasticsearchAccessConfig,
11
+ ElasticsearchConnectionConfig,
12
+ ElasticsearchUploaderConfig,
13
+ ElasticsearchUploadStagerConfig,
14
+ )
15
+ from unstructured_ingest.processes.connectors.local import (
16
+ LocalConnectionConfig,
17
+ LocalDownloaderConfig,
18
+ LocalIndexerConfig,
19
+ )
20
+ from unstructured_ingest.processes.embedder import EmbedderConfig
21
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
22
+
23
+ base_path = Path(__file__).parent.parent.parent.parent
24
+ docs_path = base_path / "example-docs"
25
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
26
+ output_path = work_dir / "output"
27
+ download_path = work_dir / "download"
28
+
29
+ if __name__ == "__main__":
30
+ logger.info(f"writing all content in: {work_dir.resolve()}")
31
+ index_name = "ingest-test-destination"
32
+ Pipeline.from_configs(
33
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
34
+ indexer_config=LocalIndexerConfig(
35
+ input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
36
+ ),
37
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
38
+ source_connection_config=LocalConnectionConfig(),
39
+ partitioner_config=PartitionerConfig(strategy="fast"),
40
+ chunker_config=ChunkerConfig(chunking_strategy="by_title"),
41
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
42
+ destination_connection_config=ElasticsearchConnectionConfig(
43
+ access_config=ElasticsearchAccessConfig(password=os.getenv("ELASTIC_PASSWORD")),
44
+ username=os.getenv("ELASTIC_USERNAME"),
45
+ hosts=["http://localhost:9200"],
46
+ ),
47
+ uploader_config=ElasticsearchUploaderConfig(index_name=index_name),
48
+ stager_config=ElasticsearchUploadStagerConfig(index_name=index_name),
49
+ ).run()
examples/google_drive.py ADDED
@@ -0,0 +1,45 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.pipeline.pipeline import Pipeline
6
+ from unstructured_ingest.processes.chunker import ChunkerConfig
7
+ from unstructured_ingest.processes.connectors.google_drive import (
8
+ CONNECTOR_TYPE,
9
+ GoogleDriveAccessConfig,
10
+ GoogleDriveConnectionConfig,
11
+ GoogleDriveDownloaderConfig,
12
+ GoogleDriveIndexerConfig,
13
+ )
14
+ from unstructured_ingest.processes.connectors.local import (
15
+ LocalUploaderConfig,
16
+ )
17
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
18
+
19
+ base_path = Path(__file__).parent.parent.parent.parent
20
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
21
+ output_path = work_dir / "output"
22
+
23
+
24
+ if __name__ == "__main__":
25
+ Pipeline.from_configs(
26
+ context=ProcessorConfig(work_dir=str(work_dir.resolve())),
27
+ # You'll need to set GOOGLE_DRIVE_SERVICE_KEY and GOOGLE_DRIVE_DRIVE_ID
28
+ # environment variable to run this example
29
+ source_connection_config=GoogleDriveConnectionConfig(
30
+ access_config=GoogleDriveAccessConfig(
31
+ service_account_key=os.environ.get("GOOGLE_DRIVE_SERVICE_KEY")
32
+ ),
33
+ drive_id=os.environ.get("GOOGLE_DRIVE_DRIVE_ID"),
34
+ ),
35
+ indexer_config=GoogleDriveIndexerConfig(
36
+ resursive=True,
37
+ ),
38
+ downloader_config=GoogleDriveDownloaderConfig(),
39
+ partitioner_config=PartitionerConfig(strategy="fast"),
40
+ chunker_config=ChunkerConfig(
41
+ chunking_strategy="basic",
42
+ ),
43
+ embedder_config=None,
44
+ uploader_config=LocalUploaderConfig(output_dir=output_path),
45
+ ).run()
examples/kdbai.py ADDED
@@ -0,0 +1,54 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from unstructured_ingest.interfaces import ProcessorConfig
5
+ from unstructured_ingest.logger import logger
6
+ from unstructured_ingest.pipeline.pipeline import Pipeline
7
+ from unstructured_ingest.processes.chunker import ChunkerConfig
8
+ from unstructured_ingest.processes.connectors.kdbai import (
9
+ CONNECTOR_TYPE,
10
+ KdbaiConnectionConfig,
11
+ KdbaiUploaderConfig,
12
+ KdbaiUploadStagerConfig,
13
+ )
14
+ from unstructured_ingest.processes.connectors.local import (
15
+ LocalConnectionConfig,
16
+ LocalDownloaderConfig,
17
+ LocalIndexerConfig,
18
+ )
19
+ from unstructured_ingest.processes.embedder import EmbedderConfig
20
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
21
+
22
+ base_path = Path(__file__).parent.parent.parent.parent
23
+ docs_path = base_path / "example-docs"
24
+ work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
25
+ output_path = work_dir / "output"
26
+ download_path = work_dir / "download"
27
+ input_path = docs_path.resolve() / "pdf" / "fake-memo.pdf"
28
+
29
+ os.environ["KDBAI_API_KEY"] = "key"
30
+ os.environ["KDBAI_ENDPOINT"] = "http://localhost"
31
+ os.environ["KDBAI_DATABASE"] = "default"
32
+ os.environ["KDBAI_TABLE"] = "table"
33
+
34
+ if __name__ == "__main__":
35
+ logger.info(f"writing all content in: {work_dir.resolve()}")
36
+ logger.info(f"processing file(s): {input_path.resolve()}")
37
+ Pipeline.from_configs(
38
+ context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True),
39
+ indexer_config=LocalIndexerConfig(
40
+ input_path=docs_path.resolve() / "book-war-and-peace-1p.txt"
41
+ ),
42
+ downloader_config=LocalDownloaderConfig(download_dir=download_path),
43
+ source_connection_config=LocalConnectionConfig(),
44
+ partitioner_config=PartitionerConfig(strategy="fast"),
45
+ chunker_config=ChunkerConfig(chunking_strategy="by_title"),
46
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
47
+ destination_connection_config=KdbaiConnectionConfig(
48
+ endpoint=os.environ["KDBAI_ENDPOINT"],
49
+ ),
50
+ stager_config=KdbaiUploadStagerConfig(),
51
+ uploader_config=KdbaiUploaderConfig(
52
+ database_name=os.environ["KDBAI_DATABASE"], table_name=os.environ["KDBAI_TABLE"]
53
+ ),
54
+ ).run()
examples/local.py ADDED
@@ -0,0 +1,36 @@
1
+ from pathlib import Path
2
+
3
+ from unstructured_ingest.interfaces import ProcessorConfig
4
+ from unstructured_ingest.logger import logger
5
+ from unstructured_ingest.pipeline.pipeline import Pipeline
6
+ from unstructured_ingest.processes.chunker import ChunkerConfig
7
+ from unstructured_ingest.processes.connectors.local import (
8
+ CONNECTOR_TYPE,
9
+ LocalConnectionConfig,
10
+ LocalDownloaderConfig,
11
+ LocalIndexerConfig,
12
+ LocalUploaderConfig,
13
+ )
14
+ from unstructured_ingest.processes.embedder import EmbedderConfig
15
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
16
+
17
# Filesystem locations used by this example, all derived from this file's path.
# NOTE(review): the four ``.parent`` hops assume a specific directory depth for
# this script -- confirm they still resolve to the folder holding "example-docs".
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path / "example-docs"
work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
output_path = work_dir / "output"
download_path = work_dir / "download"

if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")
    Pipeline.from_configs(
        context=ProcessorConfig(work_dir=str(work_dir.resolve())),
        indexer_config=LocalIndexerConfig(
            # Join with pathlib rather than gluing "/..." onto a string, so the
            # separator is always the platform's own; still handed over as str.
            input_path=str(docs_path.resolve() / "language-docs" / "UDHR_first_article_all.txt")
        ),
        downloader_config=LocalDownloaderConfig(download_dir=download_path),
        source_connection_config=LocalConnectionConfig(),
        partitioner_config=PartitionerConfig(strategy="fast"),
        chunker_config=ChunkerConfig(chunking_strategy="by_title"),
        embedder_config=EmbedderConfig(embedding_provider="huggingface"),
        # Results are written under ``work_dir / "output"``.
        uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
    ).run()
examples/milvus.py ADDED
@@ -0,0 +1,44 @@
1
+ from pathlib import Path
2
+
3
+ from unstructured_ingest.interfaces import ProcessorConfig
4
+ from unstructured_ingest.logger import logger
5
+ from unstructured_ingest.pipeline.pipeline import Pipeline
6
+ from unstructured_ingest.processes.chunker import ChunkerConfig
7
+ from unstructured_ingest.processes.connectors.local import (
8
+ LocalConnectionConfig,
9
+ LocalDownloaderConfig,
10
+ LocalIndexerConfig,
11
+ )
12
+ from unstructured_ingest.processes.connectors.milvus import (
13
+ CONNECTOR_TYPE,
14
+ MilvusConnectionConfig,
15
+ MilvusUploaderConfig,
16
+ MilvusUploadStagerConfig,
17
+ )
18
+ from unstructured_ingest.processes.embedder import EmbedderConfig
19
+ from unstructured_ingest.processes.partitioner import PartitionerConfig
20
+
21
# Filesystem locations used by this example, all derived from this file's path.
# NOTE(review): the four ``.parent`` hops assume a specific directory depth for
# this script -- confirm they still resolve to the folder holding "example-docs".
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path / "example-docs"
work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
output_path = work_dir / "output"
download_path = work_dir / "download"

if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")
    Pipeline.from_configs(
        context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True),
        indexer_config=LocalIndexerConfig(
            # Join with pathlib rather than gluing "/..." onto a string, so the
            # separator is always the platform's own; still handed over as str.
            input_path=str(docs_path.resolve() / "book-war-and-peace-1p.txt")
        ),
        downloader_config=LocalDownloaderConfig(download_dir=download_path),
        source_connection_config=LocalConnectionConfig(),
        partitioner_config=PartitionerConfig(strategy="fast"),
        chunker_config=ChunkerConfig(chunking_strategy="by_title"),
        embedder_config=EmbedderConfig(embedding_provider="huggingface"),
        # Destination side: a Milvus server at a fixed local URI and database name.
        destination_connection_config=MilvusConnectionConfig(
            uri="http://localhost:19530", db_name="milvus"
        ),
        stager_config=MilvusUploadStagerConfig(),
        uploader_config=MilvusUploaderConfig(collection_name="ingest_test"),
    ).run()