unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +55 -27
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +8 -8
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/METADATA +98 -97
  279. unstructured_ingest-0.7.1.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/entry_points.txt +0 -0
@@ -1,85 +0,0 @@
1
- import logging
2
- from dataclasses import dataclass
3
-
4
- import click
5
-
6
- from unstructured_ingest.v2.cli.base.cmd import BaseCmd
7
- from unstructured_ingest.v2.cli.utils.click import Dict, conform_click_options
8
- from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model
9
- from unstructured_ingest.v2.logger import logger
10
- from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
11
-
12
-
13
@dataclass
class DestCmd(BaseCmd):
    """CLI command for a destination connector, invoked as a child of a source command."""

    # Registry entry whose uploader / stager / connection configs drive the CLI options.
    registry_entry: DestinationRegistryEntry

    def get_registry_options(self):
        """Build the click options derived from this destination's config models.

        Config models that are unset (falsy) on the registry entry are skipped.
        The collected options are passed through ``consolidate_options`` before
        being returned.
        """
        candidate_configs = (
            self.registry_entry.uploader_config,
            self.registry_entry.upload_stager_config,
            self.registry_entry.connection_config,
        )
        options = []
        for config in candidate_configs:
            if config:
                options.extend(options_from_base_model(model=config))
        return self.consolidate_options(options=options)

    def cmd(self, ctx: click.Context, **options) -> None:
        """Run the pipeline for the parent source command and this destination.

        Raises:
            click.ClickException: if the command has no parent context, the
                parent has no ``info_name``, or building/running the pipeline fails.
        """
        logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
        parent = ctx.parent
        if not parent:
            raise click.ClickException("destination command called without a parent")
        if not parent.info_name:
            raise click.ClickException("parent command missing info name")
        # CLI names use dashes; the source name handed to get_pipeline uses underscores.
        source_cmd = parent.info_name.replace("-", "_")
        # The parent was validated above, so its params can be read directly.
        source_options: dict = parent.params
        conform_click_options(options)
        try:
            pipeline = self.get_pipeline(
                src=source_cmd,
                source_options=source_options,
                dest=self.cmd_name,
                destination_options=options,
            )
            pipeline.run()
        except Exception as e:
            logger.error(f"failed to run destination command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Command:
        """Create the click command for this destination without using decorators.

        Raises:
            ValueError: if click does not produce a ``click.core.Command``.
        """
        cmd = click.command(click.pass_context(self.cmd))
        if not isinstance(cmd, click.core.Command):
            raise ValueError(f"generated command was not of expected type Command: {type(cmd)}")
        cmd.name = self.cli_cmd_name
        cmd.short_help = "v2"
        cmd.invoke_without_command = True
        self.add_options(cmd)
        cmd.params.append(
            click.Option(
                ["--custom-stager"],
                required=False,
                type=str,
                default=None,
                help="Pass a pointer to a custom upload stager to use, "
                "must be in format '<module>:<attribute>'",
            )
        )
        cmd.params.append(
            click.Option(
                ["--custom-stager-config-kwargs"],
                required=False,
                type=Dict(),
                default=None,
                help="Any kwargs to instantiate the configuration "
                "associated with the custom stager",
            )
        )
        return cmd
@@ -1,85 +0,0 @@
1
- import logging
2
- from dataclasses import dataclass, field
3
- from typing import Any
4
-
5
- import click
6
- from pydantic import BaseModel
7
-
8
- from unstructured_ingest.v2.cli.base.cmd import BaseCmd
9
- from unstructured_ingest.v2.cli.utils.click import Group, conform_click_options
10
- from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model
11
- from unstructured_ingest.v2.interfaces import ProcessorConfig
12
- from unstructured_ingest.v2.logger import logger
13
- from unstructured_ingest.v2.processes import (
14
- ChunkerConfig,
15
- EmbedderConfig,
16
- FiltererConfig,
17
- PartitionerConfig,
18
- )
19
- from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
20
-
21
-
22
@dataclass
class SrcCmd(BaseCmd):
    """CLI group for a source connector; destination commands can be nested under it."""

    # Registry entry whose connection / indexer / downloader configs drive the CLI options.
    registry_entry: SourceRegistryEntry
    # Config models listed by default for every source command.
    default_configs: list[BaseModel] = field(
        default_factory=lambda: [
            ProcessorConfig,
            PartitionerConfig,
            EmbedderConfig,
            FiltererConfig,
            ChunkerConfig,
        ]
    )

    def get_registry_options(self):
        """Build the click options derived from this source's config models.

        Config models that are unset (falsy) on the registry entry are skipped.
        """
        candidate_configs = (
            self.registry_entry.connection_config,
            self.registry_entry.indexer_config,
            self.registry_entry.downloader_config,
        )
        collected = []
        for config in candidate_configs:
            if not config:
                continue
            collected.extend(options_from_base_model(model=config))
        return self.consolidate_options(options=collected)

    def cmd(self, ctx: click.Context, **options: dict[str, Any]) -> None:
        """Run a source-only pipeline unless a subcommand was invoked.

        Raises:
            click.ClickException: if building or running the pipeline fails.
        """
        # When a subcommand is invoked this callback does nothing.
        if ctx.invoked_subcommand:
            return

        conform_click_options(options)
        verbose = options.get("verbose", False)
        logger.setLevel(logging.DEBUG if verbose else logging.INFO)
        try:
            self.get_pipeline(src=self.cmd_name, source_options=options).run()
        except Exception as e:
            logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Group:
        """Create the click group for this source without using decorators.

        Raises:
            ValueError: if click does not produce a ``click.core.Group``.
        """
        callback = click.pass_context(self.cmd)
        group = click.group(callback, cls=Group)
        if not isinstance(group, click.core.Group):
            raise ValueError(f"generated src command was not of expected type Group: {type(group)}")
        group.name = self.cli_cmd_name
        group.short_help = "v2"
        group.invoke_without_command = True
        self.add_options(group)

        # TODO remove after v1 no longer supported
        output_dir_option = click.Option(
            ["--output-dir"],
            required=False,
            type=str,
            help="Local path to write partitioned output to",
        )
        group.params.append(output_dir_option)
        return group
@@ -1,24 +0,0 @@
1
- import click
2
-
3
- from unstructured_ingest.v2.cli.cmds import dest, src
4
-
5
-
6
# Root click group of the CLI; source groups are attached to it by get_cmd().
# NOTE(review): left without a docstring on purpose, since click would presumably
# surface one as the group's help text.
@click.group()
def ingest():
    pass
9
-
10
-
11
def get_cmd() -> click.Command:
    """Assemble and return the top-level ``ingest`` click command.

    Every destination subcommand is registered under every source subcommand,
    and each source subcommand is then registered under ``ingest``.
    """
    for src_subcommand in src:
        # Each source can be paired with any destination.
        for dest_subcommand in dest:
            src_subcommand.add_command(dest_subcommand)
        ingest.add_command(src_subcommand)
    return ingest
File without changes
@@ -1,126 +0,0 @@
1
- import ast
2
- import json
3
- import os
4
- from logging import Formatter, Logger, StreamHandler, getLevelName, getLogger
5
- from typing import Any, Callable
6
-
7
# Name of the log level to use, taken from INGEST_LOG_LEVEL (falls back to "INFO").
log_level = os.environ.get("INGEST_LOG_LEVEL", "INFO")

# Name under which the package logger is registered.
LOGGER_NAME = "unstructured_ingest.v2"
9
-
10
-
11
def default_is_data_sensitive(k: str, v: Any) -> bool:
    """Return True when the key/value pair should be redacted.

    A pair is sensitive when the value is truthy and the lowercased key contains
    one of the trigger substrings, or when the lowercased key is one of the
    always-sensitive field names (regardless of the value).
    """
    sensitive_fields = [
        "account_name",
        "client_id",
    ]
    sensitive_triggers = ["key", "cred", "token", "password", "oauth", "secret"]
    return (
        v
        and any([s in k.lower() for s in sensitive_triggers])  # noqa: C419
        or k.lower() in sensitive_fields
    )


def hide_sensitive_fields(
    data: dict, is_sensitive_fn: Callable[[str, Any], bool] = default_is_data_sensitive
) -> dict:
    """Return a shallow copy of ``data`` with sensitive values replaced by "*******".

    Every key/value pair is checked with ``is_sensitive_fn``. Nested dicts are
    processed recursively, and string values that parse as a JSON object are
    processed the same way and re-serialised with ``json.dumps``. The same
    ``is_sensitive_fn`` is applied at every nesting level.

    Args:
        data: mapping to redact; it is not modified.
        is_sensitive_fn: predicate called as ``is_sensitive_fn(key, value)``.

    Returns:
        A new dict with the redactions applied.
    """
    new_data = data.copy()
    for k, v in new_data.items():
        if is_sensitive_fn(k, v):
            new_data[k] = "*******"
        # Structured values are redacted field by field; this replaces a mask
        # set just above for the same key.
        if isinstance(v, dict):
            new_data[k] = hide_sensitive_fields(v, is_sensitive_fn=is_sensitive_fn)
        elif isinstance(v, str):
            # Strings may be dicts serialised with json.dumps().
            try:
                json_data = json.loads(v)
            except json.JSONDecodeError:
                continue
            if isinstance(json_data, dict):
                updated_data = hide_sensitive_fields(json_data, is_sensitive_fn=is_sensitive_fn)
                new_data[k] = json.dumps(updated_data)

    return new_data
50
-
51
-
52
def redact_jsons(s: str) -> str:
    """Redact sensitive fields in every JSON-like object embedded in ``s``.

    The string is scanned for brace-balanced ``{...}`` spans. Each span is parsed
    as JSON, or failing that as a Python literal, passed through
    hide_sensitive_fields() and substituted back into the string.

    An opening '{' with no matching '}' is skipped and scanning continues after
    it, so '{ text, {"a": 3}' still has its inner object processed. Spans that
    cannot be parsed into a dict are left untouched.

    Args:
        s: arbitrary text, e.g. a formatted log line.

    Returns:
        The text with each parsable object replaced by its redacted JSON form.
    """
    if "{" not in s:
        return s

    length = len(s)
    candidates = []
    i = 0
    while i < length:
        if s[i] != "{":
            i += 1
            continue
        # Find the brace that closes the one at position i.
        depth = 0
        end = None
        for j in range(i, length):
            if s[j] == "{":
                depth += 1
            elif s[j] == "}":
                depth -= 1
                if depth == 0:
                    end = j
                    break
        if end is None:
            # Unbalanced opening brace: step over it instead of reading past the end.
            i += 1
            continue
        candidates.append(s[i : end + 1])
        i = end + 1

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            # Could be the repr of a dict (single quotes, None, True, ...).
            try:
                # Round-trip through json so keys and values are JSON types.
                parsed = json.loads(json.dumps(ast.literal_eval(candidate)))
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a literal at all (e.g. "{foo}"); leave this span as it is.
                continue
        if not isinstance(parsed, dict):
            continue
        s = s.replace(candidate, json.dumps(hide_sensitive_fields(parsed)))
    return s
92
-
93
-
94
class SensitiveFormatter(Formatter):
    """Formatter that runs each rendered record through redact_jsons()."""

    def format(self, record):
        """Render ``record`` and return the redacted text.

        If redaction raises, the rendered text is returned prefixed with
        "Failed to redact: ".
        """
        rendered = super().format(record=record)
        try:
            redacted = redact_jsons(rendered)
        except Exception:
            return f"Failed to redact: {rendered}"
        return redacted
101
-
102
-
103
def remove_root_handlers(logger: Logger) -> None:
    """Remove every handler attached to the root logger reachable from ``logger``.

    NOTE(robinson): in some environments such as Google Colab, there is a root handler
    that does not mask secrets, meaning sensitive info such as api keys appear in logs.
    Removing these when they exist prevents this behavior.
    """
    root = logger.root
    # Iterate over a snapshot: removeHandler() mutates root.handlers, and removing
    # while iterating the live list skips every other handler.
    for handler in list(root.handlers):
        root.removeHandler(handler)
110
-
111
-
112
def make_default_logger(level: int) -> Logger:
    """Configure and return the package logger.

    A stream handler named "ingest_log_handler" with a SensitiveFormatter is
    attached unless a handler with that name is already present. The logger
    level is then set and the root logger's handlers are removed.
    """
    ingest_logger = getLogger(LOGGER_NAME)

    stream_handler = StreamHandler()
    stream_handler.name = "ingest_log_handler"
    stream_handler.setFormatter(
        SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    )

    # Avoid stacking duplicate handlers when this is called more than once.
    already_attached = any(
        existing.name == stream_handler.name for existing in ingest_logger.handlers
    )
    if not already_attached:
        ingest_logger.addHandler(stream_handler)

    ingest_logger.setLevel(level)
    remove_root_handlers(ingest_logger)
    return ingest_logger
124
-
125
-
126
# Package-wide logger, created at import time at the level named by ``log_level``.
_configured_level = getLevelName(log_level.upper())
logger = make_default_logger(level=_configured_level)
@@ -1,11 +0,0 @@
1
- #!/usr/bin/env python3
2
- from unstructured_ingest.v2.cli.cli import get_cmd
3
-
4
-
5
def main():
    """Build the ingest CLI command and invoke it."""
    get_cmd()()


if __name__ == "__main__":
    main()
File without changes
@@ -1,211 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import asyncio
4
- import logging
5
- import multiprocessing as mp
6
- import shutil
7
- from abc import ABC, abstractmethod
8
- from concurrent.futures import ThreadPoolExecutor
9
- from dataclasses import dataclass
10
- from pathlib import Path
11
- from typing import Any, Awaitable, Callable, Optional, TypeVar
12
-
13
- from tqdm import tqdm
14
- from tqdm.asyncio import tqdm as tqdm_asyncio
15
-
16
- from unstructured_ingest.v2.interfaces import BaseProcess, ProcessorConfig, Uploader
17
- from unstructured_ingest.v2.logger import logger, make_default_logger
18
- from unstructured_ingest.v2.otel import OtelHandler
19
- from unstructured_ingest.v2.pipeline.otel import instrument
20
-
21
# Type variable for the process wrapped by a pipeline step.
BaseProcessT = TypeVar("BaseProcessT", bound=BaseProcess)

# A batch of step inputs: one dict of keyword arguments per invocation of run().
iterable_input = list[dict[str, Any]]
23
-
24
-
25
@dataclass
class PipelineStep(ABC):
    """Base class for one pipeline stage wrapping a single process.

    Calling the step dispatches its inputs to an async, multiprocess or serial
    strategy, chosen from flags on ``context`` and from ``process.is_async()``.
    ``_run_async`` raises ``NotImplementedError`` here and must be overridden.
    """

    # The process this step executes.
    process: BaseProcessT
    # Shared processor configuration; also collects per-file errors in ``status``.
    context: ProcessorConfig
    # Step name used for logging, tracing spans and the cache directory name.
    identifier: str

    def __str__(self):
        return self.identifier

    def process_serially(self, iterable: iterable_input) -> Any:
        """Run the step once per input, one after another, and return the results.

        With no inputs the step is run exactly once without arguments.
        """
        logger.info("processing content serially")
        if not iterable:
            return [self.run()]
        if len(iterable) == 1:
            return [self.run(**iterable[0])]
        items = tqdm(iterable, desc=self.identifier) if self.context.tqdm else iterable
        return [self.run(**item) for item in items]

    async def _process_async(self, iterable: iterable_input) -> Any:
        """Await the step for every input concurrently and return the results."""
        if not iterable:
            return [await self.run_async()]
        if len(iterable) == 1:
            return [await self.run_async(**iterable[0])]
        coroutines = [self.run_async(**item) for item in iterable]
        if self.context.tqdm:
            return await tqdm_asyncio.gather(*coroutines, desc=self.identifier)
        return await asyncio.gather(*coroutines)

    def process_async(self, iterable: iterable_input) -> Any:
        """Synchronous entry point for the async processing strategy."""
        logger.info("processing content async")
        return self.asyncio_run(fn=self._process_async, iterable=iterable)

    def asyncio_run(
        self, fn: Callable[[Any, Any], Awaitable[Any]], *args: Any, **kwargs: Any
    ) -> Any:
        """Run the coroutine function ``fn`` to completion and return its result.

        If an event loop is already running in the current thread, the coroutine
        is run via ``asyncio.run`` in a separate worker thread instead.
        """
        # Public replacement for the private asyncio._get_running_loop().
        try:
            current_loop = asyncio.get_running_loop()
        except RuntimeError:
            current_loop = None
        if current_loop is None:
            return asyncio.run(fn(*args, **kwargs))
        with ThreadPoolExecutor(thread_name_prefix="asyncio") as thread_pool:
            logger.warning(
                f"async code being run in dedicated thread pool "
                f"to not conflict with existing event loop: {current_loop}"
            )

            def wrapped():
                return asyncio.run(fn(*args, **kwargs))

            future = thread_pool.submit(wrapped)
            return future.result()

    def process_multiprocess(self, iterable: iterable_input) -> Any:
        """Run the step for every input across a multiprocessing pool.

        Falls back to serial processing for a single input or when
        ``context.num_processes`` is 1. With no inputs the step is run once.
        """
        logger.info("processing content across processes")

        if not iterable:
            return [self.run()]
        if len(iterable) == 1 or self.context.num_processes == 1:
            return self.process_serially(iterable)
        with mp.Pool(
            processes=self.context.num_processes,
            initializer=self._init_mp,
            initargs=(
                logging.DEBUG if self.context.verbose else logging.INFO,
                self.context.otel_endpoint,
            ),
        ) as pool:
            otel_context = OtelHandler.inject_context()
            # Attach the trace context to copies so the caller's dicts are untouched.
            payloads = [
                {**item, OtelHandler.trace_context_key: otel_context} for item in iterable
            ]
            if self.context.tqdm:
                return list(
                    tqdm(
                        pool.imap_unordered(func=self._wrap_mp, iterable=payloads),
                        total=len(payloads),
                        desc=self.identifier,
                    )
                )
            return pool.map(self._wrap_mp, payloads)

    def _wrap_mp(self, input_kwargs: dict) -> Any:
        # Allow mapping of kwargs via multiprocessing map()
        return self.run(**input_kwargs)

    def _init_mp(self, log_level: int, endpoint: Optional[str] = None) -> None:
        # Init logger for each spawned process when using multiprocessing pool
        make_default_logger(level=log_level)
        otel_handler = OtelHandler(otel_endpoint=endpoint, log_out=logger.debug)
        otel_handler.init_trace()

    @instrument()
    def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
        """Process ``iterable`` with the strategy selected from the context flags.

        Async is used when ``context.async_supported`` and the process is async,
        otherwise multiprocessing when ``context.mp_supported``, otherwise serial.
        """
        iterable = iterable or []
        if iterable:
            logger.info(
                f"calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
            )
        else:
            logger.info(f"calling {self.__class__.__name__} with no inputs")
        if self.context.async_supported and self.process.is_async():
            return self.process_async(iterable=iterable)
        if self.context.mp_supported:
            return self.process_multiprocess(iterable=iterable)
        return self.process_serially(iterable=iterable)

    def _run(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
        # The synchronous path drives run_async, forwarding ``fn`` as its ``_fn``.
        return self.asyncio_run(fn=self.run_async, _fn=fn, **kwargs)

    async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
        raise NotImplementedError

    def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]:
        """Run the step once inside a tracing span.

        Args:
            _fn: callable to execute; defaults to ``self.process.run``.
            **kwargs: forwarded to ``_run``. An entry under the trace context key
                is removed and attached to the tracing handler first.

        Returns:
            The result of ``_run``, or None when an exception was raised and
            ``context.raise_on_error`` is falsy.
        """
        kwargs = kwargs.copy()
        otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
        tracer = otel_handler.get_tracer()
        if trace_context := kwargs.pop(otel_handler.trace_context_key, {}):
            otel_handler.attach_context(trace_context=trace_context)
        attributes = {}
        if file_data_path := kwargs.get("file_data_path"):
            attributes["file_id"] = Path(file_data_path).stem
        try:
            with tracer.start_as_current_span(self.identifier, record_exception=True) as span:
                otel_handler.set_attributes(span, attributes)
                fn = _fn or self.process.run
                return self._run(fn=fn, **kwargs)
        except Exception as e:
            logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
            # Record the failure against the file so it can be reported later.
            if "file_data_path" in kwargs:
                self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
            if self.context.raise_on_error:
                raise
            return None

    async def run_async(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
        """Async counterpart of ``run``; ``_fn`` defaults to ``self.process.run_async``."""
        otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
        try:
            attributes = {}
            if file_data_path := kwargs.get("file_data_path"):
                attributes["file_id"] = Path(file_data_path).stem
            with otel_handler.get_tracer().start_as_current_span(
                self.identifier, record_exception=True
            ) as span:
                otel_handler.set_attributes(span, attributes)
                fn = _fn or self.process.run_async
                return await self._run_async(fn=fn, **kwargs)
        except Exception as e:
            logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
            if "file_data_path" in kwargs:
                self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
            if self.context.raise_on_error:
                raise
            return None

    @property
    def cache_dir(self) -> Path:
        """Directory for this step: ``context.work_dir`` joined with the identifier."""
        return Path(self.context.work_dir) / self.identifier

    def delete_cache(self):
        """Delete this step's cache directory when ``context.iter_delete`` is set."""
        if self.context.iter_delete and self.cache_dir.exists():
            cache_dir = self.cache_dir
            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
            shutil.rmtree(cache_dir)
189
-
190
-
191
@dataclass
class BatchPipelineStep(PipelineStep, ABC):
    """Pipeline step whose process can consume all of its inputs in one batch call."""

    process: Uploader

    def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
        """Run as a single batch when supported, else fall back to per-item processing.

        The batch path is taken when ``context.mp_supported`` is set and
        ``process.is_batch()`` is true.
        """
        if self.context.mp_supported and self.process.is_batch():
            return self.run_batch(contents=iterable)
        # Propagate the per-item results instead of silently returning None.
        return super().__call__(iterable=iterable)

    @abstractmethod
    def _run_batch(self, contents: iterable_input, **kwargs) -> Any:
        pass

    def run_batch(self, contents: iterable_input, **kwargs) -> Any:
        """Run ``_run_batch`` and record a failure under this step's identifier.

        Returns:
            The result of ``_run_batch``, or None when it raised and
            ``context.raise_on_error`` is falsy.
        """
        try:
            return self._run_batch(contents=contents, **kwargs)
        except Exception as e:
            self.context.status[self.identifier] = {"step_error": str(e)}
            if self.context.raise_on_error:
                raise
            return None