unstructured-ingest 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +49 -0
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/processes/connectors/github.py +221 -0
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
  279. unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,408 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import asyncio
4
- import logging
5
- import multiprocessing as mp
6
- import shutil
7
- from dataclasses import InitVar, dataclass, field
8
- from pathlib import Path
9
- from typing import Any
10
-
11
- from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
12
- from unstructured_ingest.v2.logger import logger, make_default_logger
13
- from unstructured_ingest.v2.otel import OtelHandler
14
- from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
15
- from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
16
- from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
17
- from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
18
- from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
19
- from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
20
- from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
21
- from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
22
- from unstructured_ingest.v2.pipeline.steps.uncompress import Uncompressor, UncompressStep
23
- from unstructured_ingest.v2.pipeline.steps.upload import UploadStep
24
- from unstructured_ingest.v2.processes.chunker import ChunkerConfig
25
- from unstructured_ingest.v2.processes.connector_registry import (
26
- ConnectionConfig,
27
- DownloaderConfigT,
28
- IndexerConfigT,
29
- UploaderConfigT,
30
- UploadStagerConfigT,
31
- destination_registry,
32
- source_registry,
33
- )
34
- from unstructured_ingest.v2.processes.connectors.local import LocalUploader
35
- from unstructured_ingest.v2.processes.embedder import EmbedderConfig
36
- from unstructured_ingest.v2.processes.filter import FiltererConfig
37
- from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
38
-
39
-
40
- class PipelineError(Exception):
41
- pass
42
-
43
-
44
- @dataclass
45
- class Pipeline:
46
- context: ProcessorConfig
47
-
48
- indexer: InitVar[IndexerT]
49
- indexer_step: IndexStep = field(init=False)
50
-
51
- downloader: InitVar[DownloaderT]
52
- downloader_step: DownloadStep = field(init=False)
53
-
54
- partitioner: InitVar[Partitioner]
55
- partitioner_step: PartitionStep = field(init=False)
56
-
57
- chunker: InitVar[Chunker | None] = None
58
- chunker_step: ChunkStep | None = field(init=False, default=None)
59
-
60
- embedder: InitVar[Embedder | None] = None
61
- embedder_step: EmbedStep | None = field(init=False, default=None)
62
-
63
- stager: InitVar[UploadStager | None] = None
64
- stager_step: UploadStageStep | None = field(init=False, default=None)
65
-
66
- uploader: InitVar[Uploader] = field(default=LocalUploader())
67
- uploader_step: UploadStep | None = field(init=False, default=None)
68
-
69
- uncompress_step: UncompressStep | None = field(init=False, default=None)
70
-
71
- filterer: InitVar[Filterer | None] = None
72
- filter_step: FilterStep | None = field(init=False, default=None)
73
-
74
- def __post_init__(
75
- self,
76
- indexer: IndexerT,
77
- downloader: DownloaderT,
78
- partitioner: Partitioner,
79
- chunker: Chunker | None = None,
80
- embedder: Embedder | None = None,
81
- stager: UploadStager | None = None,
82
- uploader: Uploader | None = None,
83
- filterer: Filterer | None = None,
84
- ):
85
- make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
86
- otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint)
87
- otel_handler.init_trace()
88
- self.indexer_step = IndexStep(process=indexer, context=self.context)
89
- self.downloader_step = DownloadStep(process=downloader, context=self.context)
90
- self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
91
- self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
92
- self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None
93
-
94
- self.embedder_step = EmbedStep(process=embedder, context=self.context) if embedder else None
95
-
96
- self.stager_step = UploadStageStep(process=stager, context=self.context) if stager else None
97
- self.uploader_step = UploadStep(process=uploader, context=self.context)
98
- if self.context.uncompress:
99
- process = Uncompressor()
100
- self.uncompress_step = UncompressStep(process=process, context=self.context)
101
-
102
- self.check_destination_connector()
103
-
104
- def check_destination_connector(self):
105
- # Make sure that if the set destination connector expects a stager, one is also set
106
- if not self.uploader_step:
107
- return
108
- uploader_connector_type = self.uploader_step.process.connector_type
109
- registry_entry = destination_registry[uploader_connector_type]
110
- if registry_entry.upload_stager and self.stager_step is None:
111
- try:
112
- self.stager_step = UploadStageStep(
113
- process=registry_entry.upload_stager(), context=self.context
114
- )
115
- return
116
- except Exception as e:
117
- logger.debug(f"failed to instantiate required stager on user's behalf: {e}")
118
- raise ValueError(
119
- f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
120
- f"expects a stager of type {registry_entry.upload_stager.__name__} "
121
- f"but one was not set"
122
- )
123
-
124
- def cleanup(self):
125
- if self.context.delete_cache and Path(self.context.work_dir).exists():
126
- logger.info(f"deleting cache directory: {self.context.work_dir}")
127
- shutil.rmtree(self.context.work_dir)
128
-
129
- def log_statuses(self):
130
- if status := self.context.status:
131
- logger.error(f"{len(status)} failed documents:")
132
- for k, v in status.items():
133
- for kk, vv in v.items():
134
- logger.error(f"{k}: [{kk}] {vv}")
135
-
136
- def _run_initialization(self):
137
- failures = {}
138
- init_kwargs = {}
139
- for step in self._get_ordered_steps():
140
- try:
141
- step.process.init(**init_kwargs)
142
- step.process.precheck()
143
- # Make sure embedder dimensions available for downstream steps
144
- if isinstance(step.process, Embedder):
145
- embed_dimensions = step.process.config.get_embedder().dimension
146
- init_kwargs["vector_length"] = embed_dimensions
147
-
148
- except Exception as e:
149
- failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
150
- if failures:
151
- for k, v in failures.items():
152
- logger.error(f"Step initialization failure: {k}: {v}")
153
- raise PipelineError("Initialization failed")
154
-
155
- def run(self):
156
- otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.info)
157
- try:
158
- with otel_handler.get_tracer().start_as_current_span(
159
- "ingest process", record_exception=True
160
- ):
161
- self._run_initialization()
162
- self._run()
163
- finally:
164
- self.log_statuses()
165
- self.cleanup()
166
- if self.context.status:
167
- raise PipelineError("Pipeline did not run successfully")
168
-
169
- def clean_results(self, results: list[Any | list[Any]] | None) -> list[Any] | None:
170
- if not results:
171
- return None
172
- results = [r for r in results if r]
173
- flat = []
174
- for r in results:
175
- if isinstance(r, list):
176
- flat.extend(r)
177
- else:
178
- flat.append(r)
179
- final = [f for f in flat if f]
180
- return final or None
181
-
182
- def _get_ordered_steps(self) -> list[PipelineStep]:
183
- steps = [self.indexer_step, self.downloader_step]
184
- if self.uncompress_step:
185
- steps.append(self.uncompress_step)
186
- steps.append(self.partitioner_step)
187
- if self.chunker_step:
188
- steps.append(self.chunker_step)
189
- if self.embedder_step:
190
- steps.append(self.embedder_step)
191
- if self.stager_step:
192
- steps.append(self.stager_step)
193
- steps.append(self.uploader_step)
194
- return steps
195
-
196
- def apply_filter(self, records: list[dict]) -> list[dict]:
197
- if not self.filter_step:
198
- return records
199
- data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
200
- filtered_data = self.filter_step(data_to_filter)
201
- filtered_data = [f for f in filtered_data if f is not None]
202
- filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
203
- filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
204
- return filtered_records
205
-
206
- def get_indices(self) -> list[dict]:
207
- if self.indexer_step.process.is_async():
208
-
209
- async def run_async():
210
- output = []
211
- async for i in self.indexer_step.run_async():
212
- output.append(i)
213
- return output
214
-
215
- indices = asyncio.run(run_async())
216
- else:
217
- indices = self.indexer_step.run()
218
- indices_inputs = [{"file_data_path": i} for i in indices]
219
- return indices_inputs
220
-
221
- def _run(self):
222
- logger.info(
223
- f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
224
- )
225
- if self.context.mp_supported:
226
- manager = mp.Manager()
227
- self.context.status = manager.dict()
228
- else:
229
- self.context.status = {}
230
-
231
- # Index into data source
232
- indices_inputs = self.get_indices()
233
- if not indices_inputs:
234
- logger.info("No files to process after indexer, exiting")
235
- return
236
-
237
- # Initial filtering on indexed content
238
- indices_inputs = self.apply_filter(records=indices_inputs)
239
- if not indices_inputs:
240
- logger.info("No files to process after filtering indexed content, exiting")
241
- return
242
-
243
- # Download associated content to local file system
244
- downloaded_data = self.downloader_step(indices_inputs)
245
- downloaded_data = self.clean_results(results=downloaded_data)
246
- if not downloaded_data:
247
- logger.info("No files to process after downloader, exiting")
248
- return
249
-
250
- # Post download filtering
251
- downloaded_data = self.apply_filter(records=downloaded_data)
252
- if not downloaded_data:
253
- logger.info("No files to process after filtering downloaded content, exiting")
254
- return
255
-
256
- # Run uncompress if available
257
- if self.uncompress_step:
258
- downloaded_data = self.uncompress_step(downloaded_data)
259
- # Flatten list of lists
260
- downloaded_data = self.clean_results(results=downloaded_data)
261
-
262
- # Post uncompress filtering
263
- downloaded_data = self.apply_filter(records=downloaded_data)
264
- if not downloaded_data:
265
- logger.info("No files to process after filtering uncompressed content, exiting")
266
- return
267
-
268
- if not downloaded_data or self.context.download_only:
269
- return
270
-
271
- # Partition content
272
- elements = self.partitioner_step(downloaded_data)
273
- elements = self.clean_results(results=elements)
274
- # Download data non longer needed, delete if possible
275
- self.downloader_step.delete_cache()
276
- elements = self.clean_results(results=elements)
277
- if not elements:
278
- logger.info("No files to process after partitioning, exiting")
279
- return
280
-
281
- # Run element specific modifiers
282
- last_step = self.partitioner_step
283
- for step in [s for s in [self.chunker_step, self.embedder_step, self.stager_step] if s]:
284
- elements = step(elements)
285
- elements = self.clean_results(results=elements)
286
- # Delete data from previous step if possible since no longer needed
287
- last_step.delete_cache()
288
- last_step = step
289
- if not elements:
290
- logger.info(f"no files to process after {step.__class__.__name__}, exiting")
291
- return
292
-
293
- # Upload the final result
294
- self.uploader_step(iterable=elements)
295
- last_step.delete_cache()
296
-
297
- def __str__(self):
298
- s = [str(self.indexer_step)]
299
- if filter_step := self.filter_step:
300
- s.append(str(filter_step))
301
- s.append(str(self.downloader_step))
302
- if filter_step := self.filter_step:
303
- s.append(str(filter_step))
304
- if uncompress_step := self.uncompress_step:
305
- s.extend([str(uncompress_step), str(filter_step)])
306
- s.append(str(self.partitioner_step))
307
- if chunker_step := self.chunker_step:
308
- s.append(str(chunker_step))
309
- if embedder_step := self.embedder_step:
310
- s.append(str(embedder_step))
311
- if stager_step := self.stager_step:
312
- s.append(str(stager_step))
313
- s.append(str(self.uploader_step))
314
- return " -> ".join(s)
315
-
316
- @classmethod
317
- def from_configs(
318
- cls,
319
- context: ProcessorConfig,
320
- indexer_config: IndexerConfigT,
321
- downloader_config: DownloaderConfigT,
322
- source_connection_config: ConnectionConfig,
323
- partitioner_config: PartitionerConfig,
324
- filterer_config: FiltererConfig | None = None,
325
- chunker_config: ChunkerConfig | None = None,
326
- embedder_config: EmbedderConfig | None = None,
327
- destination_connection_config: ConnectionConfig | None = None,
328
- stager_config: UploadStagerConfigT | None = None,
329
- uploader_config: UploaderConfigT | None = None,
330
- ) -> "Pipeline":
331
- # Get registry key based on indexer config
332
- source_entry = {
333
- k: v
334
- for k, v in source_registry.items()
335
- if type(indexer_config) is v.indexer_config
336
- and type(downloader_config) is v.downloader_config
337
- and type(source_connection_config) is v.connection_config
338
- }
339
- if len(source_entry) > 1:
340
- raise ValueError(
341
- f"multiple entries found matching provided indexer, "
342
- f"downloader and connection configs: {source_entry}"
343
- )
344
- if len(source_entry) != 1:
345
- raise ValueError(
346
- "no entry found in source registry with matching indexer, "
347
- "downloader and connection configs"
348
- )
349
- source = list(source_entry.values())[0]
350
- pipeline_kwargs = {
351
- "context": context,
352
- "indexer": source.indexer(
353
- index_config=indexer_config, connection_config=source_connection_config
354
- ),
355
- "downloader": source.downloader(
356
- download_config=downloader_config, connection_config=source_connection_config
357
- ),
358
- "partitioner": Partitioner(config=partitioner_config),
359
- }
360
- if filterer_config:
361
- pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
362
- if chunker_config:
363
- pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
364
- if embedder_config:
365
- pipeline_kwargs["embedder"] = Embedder(config=embedder_config)
366
- if not uploader_config:
367
- return Pipeline(**pipeline_kwargs)
368
-
369
- destination_entry = {
370
- k: v
371
- for k, v in destination_registry.items()
372
- if isinstance(uploader_config, v.uploader_config)
373
- }
374
- if destination_connection_config:
375
- destination_entry = {
376
- k: v
377
- for k, v in destination_entry.items()
378
- if isinstance(destination_connection_config, v.connection_config)
379
- }
380
- if stager_config:
381
- destination_entry = {
382
- k: v
383
- for k, v in destination_entry.items()
384
- if isinstance(stager_config, v.upload_stager_config)
385
- }
386
-
387
- if len(destination_entry) > 1:
388
- raise ValueError(
389
- f"multiple entries found matching provided uploader, "
390
- f"stager and connection configs: {destination_entry}"
391
- )
392
- if len(destination_entry) != 1:
393
- raise ValueError(
394
- "no entry found in destination registry with matching uploader, "
395
- "stager and connection configs"
396
- )
397
-
398
- destination = list(destination_entry.values())[0]
399
- if stager_config:
400
- pipeline_kwargs["stager"] = destination.upload_stager(
401
- upload_stager_config=stager_config
402
- )
403
- if uploader_config:
404
- uploader_kwargs = {"upload_config": uploader_config}
405
- if destination_connection_config:
406
- uploader_kwargs["connection_config"] = destination_connection_config
407
- pipeline_kwargs["uploader"] = destination.uploader(**uploader_kwargs)
408
- return cls(**pipeline_kwargs)
File without changes
@@ -1,10 +0,0 @@
1
- CREATE TABLE elements (
2
- id STRING NOT NULL PRIMARY KEY,
3
- record_id STRING NOT NULL,
4
- element_id STRING NOT NULL,
5
- text STRING,
6
- embeddings ARRAY<FLOAT>,
7
- type STRING,
8
- metadata VARIANT
9
- );
10
-
@@ -1,23 +0,0 @@
1
- {
2
- "properties": [
3
- {
4
- "dataType": [
5
- "text"
6
- ],
7
- "indexFilterable": true,
8
- "indexSearchable": true,
9
- "name": "record_id",
10
- "tokenization": "word"
11
- },
12
- {
13
- "dataType": [
14
- "text"
15
- ],
16
- "indexFilterable": true,
17
- "indexSearchable": true,
18
- "name": "text",
19
- "tokenization": "word"
20
- }
21
- ],
22
- "vectorizer": "none"
23
- }
@@ -1,32 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from typing import Optional
3
-
4
- from htmlBuilder.tags import HtmlTag
5
-
6
-
7
- class FromJSONMixin(ABC):
8
- @classmethod
9
- @abstractmethod
10
- def from_dict(cls, data: dict):
11
- pass
12
-
13
-
14
- class GetHTMLMixin(ABC):
15
- @abstractmethod
16
- def get_html(self) -> Optional[HtmlTag]:
17
- pass
18
-
19
-
20
- class BlockBase(FromJSONMixin, GetHTMLMixin):
21
- @staticmethod
22
- @abstractmethod
23
- def can_have_children() -> bool:
24
- pass
25
-
26
-
27
- class DBPropertyBase(FromJSONMixin):
28
- pass
29
-
30
-
31
- class DBCellBase(FromJSONMixin, GetHTMLMixin):
32
- pass
@@ -1,96 +0,0 @@
1
- # https://developers.notion.com/reference/page
2
- from dataclasses import dataclass
3
- from typing import Optional
4
-
5
- from htmlBuilder.tags import HtmlTag
6
-
7
- from unstructured_ingest.v2.processes.connectors.notion.interfaces import (
8
- BlockBase,
9
- FromJSONMixin,
10
- GetHTMLMixin,
11
- )
12
- from unstructured_ingest.v2.processes.connectors.notion.types import blocks
13
- from unstructured_ingest.v2.processes.connectors.notion.types.parent import Parent
14
- from unstructured_ingest.v2.processes.connectors.notion.types.user import PartialUser
15
-
16
- block_type_mapping = {
17
- "bookmark": blocks.Bookmark,
18
- "breadcrumb": blocks.Breadcrumb,
19
- "bulleted_list_item": blocks.BulletedListItem,
20
- "callout": blocks.Callout,
21
- "child_database": blocks.ChildDatabase,
22
- "child_page": blocks.ChildPage,
23
- "code": blocks.Code,
24
- "column": blocks.Column,
25
- "column_list": blocks.ColumnList,
26
- "divider": blocks.Divider,
27
- "heading_1": blocks.Heading,
28
- "heading_2": blocks.Heading,
29
- "heading_3": blocks.Heading,
30
- "embed": blocks.Embed,
31
- "equation": blocks.Equation,
32
- "file": blocks.File,
33
- "image": blocks.Image,
34
- "link_preview": blocks.LinkPreview,
35
- "link_to_page": blocks.LinkToPage,
36
- "numbered_list_item": blocks.NumberedListItem,
37
- "paragraph": blocks.Paragraph,
38
- "pdf": blocks.PDF,
39
- "quote": blocks.Quote,
40
- "synced_block": blocks.SyncBlock,
41
- "table": blocks.Table,
42
- "table_of_contents": blocks.TableOfContents,
43
- "table_row": blocks.TableRow,
44
- "template": blocks.Template,
45
- "to_do": blocks.ToDo,
46
- "toggle": blocks.Toggle,
47
- "unsupported": blocks.Unsupported,
48
- "video": blocks.Video,
49
- }
50
-
51
-
52
- @dataclass
53
- class Block(FromJSONMixin, GetHTMLMixin):
54
- id: str
55
- type: str
56
- created_time: str
57
- created_by: PartialUser
58
- last_edited_time: str
59
- last_edited_by: PartialUser
60
- archived: bool
61
- in_trash: bool
62
- has_children: bool
63
- parent: Parent
64
- block: BlockBase
65
- object: str = "block"
66
- request_id: Optional[str] = None
67
-
68
- def __repr__(self):
69
- return f"{self.__class__.__name__}(id={self.id}, type={self.type})"
70
-
71
- @classmethod
72
- def from_dict(cls, data: dict):
73
- t = data["type"]
74
- block_data = data.pop(t)
75
- created_by = data.pop("created_by")
76
- last_edited_by = data.pop("last_edited_by")
77
- parent = data.pop("parent")
78
- try:
79
- block = cls(
80
- created_by=PartialUser.from_dict(created_by),
81
- last_edited_by=PartialUser.from_dict(last_edited_by),
82
- parent=Parent.from_dict(parent),
83
- block=block_type_mapping[t].from_dict(block_data), # type: ignore
84
- **data,
85
- )
86
- except KeyError as ke:
87
- raise KeyError(f"failed to map to associated block type -> {t}: {block_data}") from ke
88
- except TypeError as te:
89
- raise TypeError(f"failed to map to associated block type -> {t}: {block_data}") from te
90
-
91
- return block
92
-
93
- def get_html(self) -> Optional[HtmlTag]:
94
- if self.block:
95
- return self.block.get_html()
96
- return None
@@ -1,63 +0,0 @@
1
- from .bookmark import Bookmark
2
- from .breadcrumb import Breadcrumb
3
- from .bulleted_list_item import BulletedListItem
4
- from .callout import Callout
5
- from .child_database import ChildDatabase
6
- from .child_page import ChildPage
7
- from .code import Code
8
- from .column_list import Column, ColumnList
9
- from .divider import Divider
10
- from .embed import Embed
11
- from .equation import Equation
12
- from .file import File
13
- from .heading import Heading
14
- from .image import Image
15
- from .link_preview import LinkPreview
16
- from .link_to_page import LinkToPage
17
- from .numbered_list import NumberedListItem
18
- from .paragraph import Paragraph
19
- from .pdf import PDF
20
- from .quote import Quote
21
- from .synced_block import DuplicateSyncedBlock, OriginalSyncedBlock, SyncBlock
22
- from .table import Table, TableRow
23
- from .table_of_contents import TableOfContents
24
- from .template import Template
25
- from .todo import ToDo
26
- from .toggle import Toggle
27
- from .unsupported import Unsupported
28
- from .video import Video
29
-
30
- __all__ = [
31
- "Bookmark",
32
- "Breadcrumb",
33
- "BulletedListItem",
34
- "Callout",
35
- "ChildDatabase",
36
- "ChildPage",
37
- "Code",
38
- "Column",
39
- "ColumnList",
40
- "Divider",
41
- "Embed",
42
- "Equation",
43
- "File",
44
- "Heading",
45
- "Image",
46
- "LinkPreview",
47
- "LinkToPage",
48
- "NumberedListItem",
49
- "Paragraph",
50
- "PDF",
51
- "Quote",
52
- "SyncBlock",
53
- "OriginalSyncedBlock",
54
- "DuplicateSyncedBlock",
55
- "Table",
56
- "TableRow",
57
- "TableOfContents",
58
- "Template",
59
- "ToDo",
60
- "Toggle",
61
- "Unsupported",
62
- "Video",
63
- ]
@@ -1,21 +0,0 @@
1
- # https://developers.notion.com/reference/block#breadcrumb
2
- from dataclasses import dataclass
3
- from typing import Optional
4
-
5
- from htmlBuilder.tags import HtmlTag
6
-
7
- from unstructured_ingest.connector.notion.interfaces import BlockBase
8
-
9
-
10
- @dataclass
11
- class Breadcrumb(BlockBase):
12
- @staticmethod
13
- def can_have_children() -> bool:
14
- return False
15
-
16
- @classmethod
17
- def from_dict(cls, data: dict):
18
- return cls()
19
-
20
- def get_html(self) -> Optional[HtmlTag]:
21
- pass