unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (568)
  1. examples/airtable.py +44 -0
  2. examples/azure_cognitive_search.py +55 -0
  3. examples/chroma.py +54 -0
  4. examples/couchbase.py +55 -0
  5. examples/databricks_volumes_dest.py +55 -0
  6. examples/databricks_volumes_source.py +53 -0
  7. examples/delta_table.py +45 -0
  8. examples/discord_example.py +36 -0
  9. examples/elasticsearch.py +49 -0
  10. examples/google_drive.py +45 -0
  11. examples/kdbai.py +54 -0
  12. examples/local.py +36 -0
  13. examples/milvus.py +44 -0
  14. examples/mongodb.py +53 -0
  15. examples/opensearch.py +50 -0
  16. examples/pinecone.py +57 -0
  17. examples/s3.py +38 -0
  18. examples/salesforce.py +44 -0
  19. examples/sharepoint.py +47 -0
  20. examples/singlestore.py +49 -0
  21. examples/sql.py +90 -0
  22. examples/vectara.py +54 -0
  23. examples/weaviate.py +44 -0
  24. test/integration/chunkers/test_chunkers.py +1 -1
  25. test/integration/connectors/conftest.py +1 -1
  26. test/integration/connectors/databricks/test_volumes_native.py +3 -3
  27. test/integration/connectors/discord/test_discord.py +1 -1
  28. test/integration/connectors/duckdb/test_duckdb.py +2 -2
  29. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  30. test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
  31. test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
  32. test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
  33. test/integration/connectors/sql/test_postgres.py +2 -2
  34. test/integration/connectors/sql/test_singlestore.py +2 -2
  35. test/integration/connectors/sql/test_snowflake.py +2 -2
  36. test/integration/connectors/sql/test_sqlite.py +2 -2
  37. test/integration/connectors/sql/test_vastdb.py +1 -1
  38. test/integration/connectors/test_astradb.py +2 -2
  39. test/integration/connectors/test_azure_ai_search.py +2 -2
  40. test/integration/connectors/test_chroma.py +2 -2
  41. test/integration/connectors/test_confluence.py +1 -1
  42. test/integration/connectors/test_delta_table.py +2 -2
  43. test/integration/connectors/test_dropbox.py +2 -2
  44. test/integration/connectors/test_github.py +1 -1
  45. test/integration/connectors/test_google_drive.py +2 -2
  46. test/integration/connectors/test_jira.py +1 -1
  47. test/integration/connectors/test_lancedb.py +7 -7
  48. test/integration/connectors/test_milvus.py +2 -2
  49. test/integration/connectors/test_mongodb.py +2 -2
  50. test/integration/connectors/test_neo4j.py +7 -7
  51. test/integration/connectors/test_notion.py +2 -2
  52. test/integration/connectors/test_onedrive.py +2 -2
  53. test/integration/connectors/test_pinecone.py +3 -3
  54. test/integration/connectors/test_qdrant.py +6 -6
  55. test/integration/connectors/test_redis.py +3 -3
  56. test/integration/connectors/test_s3.py +3 -3
  57. test/integration/connectors/test_sharepoint.py +1 -1
  58. test/integration/connectors/test_vectara.py +4 -4
  59. test/integration/connectors/test_zendesk.py +2 -2
  60. test/integration/connectors/utils/validation/destination.py +2 -2
  61. test/integration/connectors/utils/validation/source.py +2 -2
  62. test/integration/connectors/weaviate/test_cloud.py +1 -1
  63. test/integration/connectors/weaviate/test_local.py +2 -2
  64. test/integration/embedders/test_azure_openai.py +1 -1
  65. test/integration/embedders/test_bedrock.py +2 -2
  66. test/integration/embedders/test_huggingface.py +1 -1
  67. test/integration/embedders/test_mixedbread.py +1 -1
  68. test/integration/embedders/test_octoai.py +2 -2
  69. test/integration/embedders/test_openai.py +2 -2
  70. test/integration/embedders/test_togetherai.py +2 -2
  71. test/integration/embedders/test_vertexai.py +1 -1
  72. test/integration/embedders/test_voyageai.py +1 -1
  73. test/integration/partitioners/test_partitioner.py +2 -2
  74. test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
  75. test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
  76. test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
  77. test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
  78. test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
  79. test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
  80. test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
  81. test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
  82. test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
  83. test/unit/test_html.py +1 -1
  84. test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
  85. test/unit/test_utils.py +106 -97
  86. unstructured_ingest/__version__.py +1 -1
  87. unstructured_ingest/cli/__init__.py +0 -14
  88. unstructured_ingest/cli/base/__init__.py +4 -0
  89. unstructured_ingest/cli/base/cmd.py +259 -9
  90. unstructured_ingest/cli/base/dest.py +58 -61
  91. unstructured_ingest/cli/base/src.py +54 -36
  92. unstructured_ingest/cli/cli.py +4 -17
  93. unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
  94. unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
  95. unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
  96. unstructured_ingest/embed/bedrock.py +3 -3
  97. unstructured_ingest/embed/octoai.py +3 -3
  98. unstructured_ingest/embed/openai.py +3 -3
  99. unstructured_ingest/embed/togetherai.py +4 -4
  100. unstructured_ingest/embed/vertexai.py +1 -1
  101. unstructured_ingest/embed/voyageai.py +4 -4
  102. unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
  103. unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
  104. unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
  105. unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
  106. unstructured_ingest/{v2/otel.py → otel.py} +1 -1
  107. unstructured_ingest/pipeline/__init__.py +0 -22
  108. unstructured_ingest/pipeline/interfaces.py +179 -238
  109. unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
  110. unstructured_ingest/pipeline/pipeline.py +388 -97
  111. unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
  112. unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
  113. unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
  114. unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
  115. unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
  116. unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
  117. unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
  118. unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
  119. unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
  120. unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
  121. unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
  122. unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
  123. unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
  124. unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
  125. unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
  126. unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
  127. unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
  128. unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
  129. unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
  130. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
  131. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
  132. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
  133. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
  134. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
  135. unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
  136. unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
  137. unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
  138. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
  139. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
  140. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
  141. unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
  142. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
  143. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
  144. unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
  145. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
  146. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
  147. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
  148. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
  149. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
  150. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
  151. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
  152. unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
  153. unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
  154. unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
  155. unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
  156. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
  157. unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
  158. unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
  159. unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
  160. unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
  161. unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
  162. unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
  163. unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
  164. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
  165. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
  166. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
  167. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
  168. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
  169. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
  170. unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
  171. unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
  172. unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
  173. unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
  174. unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
  175. unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
  176. unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
  177. unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
  178. unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
  179. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  180. unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
  181. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
  182. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
  183. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
  184. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
  185. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
  186. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
  187. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
  188. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
  189. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
  190. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
  191. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
  192. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
  193. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
  194. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
  195. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
  196. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
  197. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
  198. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
  199. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
  200. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
  201. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
  202. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
  203. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
  204. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
  205. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
  206. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
  207. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
  208. unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
  209. unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
  210. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
  211. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
  212. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
  213. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
  214. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
  215. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
  216. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
  217. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
  218. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
  219. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
  220. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
  221. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
  222. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
  223. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
  224. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
  225. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
  226. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
  227. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
  228. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
  229. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
  230. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
  231. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
  232. unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
  233. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
  234. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
  235. unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
  236. unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
  237. unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
  238. unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
  239. unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +55 -27
  240. unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
  241. unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
  242. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
  243. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
  244. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
  245. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
  246. unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
  247. unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
  248. unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
  249. unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +8 -8
  250. unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
  251. unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
  252. unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
  253. unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
  254. unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
  255. unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
  256. unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
  257. unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
  258. unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
  259. unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
  260. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
  261. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
  262. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
  263. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
  264. unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
  265. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
  266. unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
  267. unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
  268. unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
  269. unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
  270. unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
  271. unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
  272. unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
  273. unstructured_ingest/utils/compression.py +1 -48
  274. unstructured_ingest/utils/data_prep.py +9 -1
  275. unstructured_ingest/utils/html.py +3 -3
  276. unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
  277. unstructured_ingest/utils/string_and_date_utils.py +1 -1
  278. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/METADATA +98 -97
  279. unstructured_ingest-0.7.1.dist-info/RECORD +370 -0
  280. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/top_level.txt +1 -0
  281. test/unit/v2/test_utils.py +0 -82
  282. unstructured_ingest/cli/cmd_factory.py +0 -12
  283. unstructured_ingest/cli/cmds/__init__.py +0 -145
  284. unstructured_ingest/cli/cmds/airtable.py +0 -69
  285. unstructured_ingest/cli/cmds/astradb.py +0 -99
  286. unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
  287. unstructured_ingest/cli/cmds/biomed.py +0 -52
  288. unstructured_ingest/cli/cmds/chroma.py +0 -104
  289. unstructured_ingest/cli/cmds/clarifai.py +0 -71
  290. unstructured_ingest/cli/cmds/confluence.py +0 -69
  291. unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
  292. unstructured_ingest/cli/cmds/delta_table.py +0 -94
  293. unstructured_ingest/cli/cmds/discord.py +0 -47
  294. unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
  295. unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
  296. unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
  297. unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
  298. unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
  299. unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
  300. unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
  301. unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
  302. unstructured_ingest/cli/cmds/github.py +0 -54
  303. unstructured_ingest/cli/cmds/gitlab.py +0 -54
  304. unstructured_ingest/cli/cmds/google_drive.py +0 -49
  305. unstructured_ingest/cli/cmds/hubspot.py +0 -70
  306. unstructured_ingest/cli/cmds/jira.py +0 -71
  307. unstructured_ingest/cli/cmds/kafka.py +0 -102
  308. unstructured_ingest/cli/cmds/local.py +0 -43
  309. unstructured_ingest/cli/cmds/mongodb.py +0 -72
  310. unstructured_ingest/cli/cmds/notion.py +0 -48
  311. unstructured_ingest/cli/cmds/onedrive.py +0 -66
  312. unstructured_ingest/cli/cmds/opensearch.py +0 -117
  313. unstructured_ingest/cli/cmds/outlook.py +0 -67
  314. unstructured_ingest/cli/cmds/pinecone.py +0 -71
  315. unstructured_ingest/cli/cmds/qdrant.py +0 -124
  316. unstructured_ingest/cli/cmds/reddit.py +0 -67
  317. unstructured_ingest/cli/cmds/salesforce.py +0 -58
  318. unstructured_ingest/cli/cmds/sharepoint.py +0 -66
  319. unstructured_ingest/cli/cmds/slack.py +0 -56
  320. unstructured_ingest/cli/cmds/sql.py +0 -66
  321. unstructured_ingest/cli/cmds/vectara.py +0 -66
  322. unstructured_ingest/cli/cmds/weaviate.py +0 -98
  323. unstructured_ingest/cli/cmds/wikipedia.py +0 -40
  324. unstructured_ingest/cli/common.py +0 -7
  325. unstructured_ingest/cli/interfaces.py +0 -663
  326. unstructured_ingest/cli/utils.py +0 -205
  327. unstructured_ingest/connector/airtable.py +0 -309
  328. unstructured_ingest/connector/astradb.py +0 -267
  329. unstructured_ingest/connector/azure_ai_search.py +0 -144
  330. unstructured_ingest/connector/biomed.py +0 -320
  331. unstructured_ingest/connector/chroma.py +0 -158
  332. unstructured_ingest/connector/clarifai.py +0 -122
  333. unstructured_ingest/connector/confluence.py +0 -285
  334. unstructured_ingest/connector/databricks_volumes.py +0 -137
  335. unstructured_ingest/connector/delta_table.py +0 -203
  336. unstructured_ingest/connector/discord.py +0 -180
  337. unstructured_ingest/connector/elasticsearch.py +0 -396
  338. unstructured_ingest/connector/fsspec/azure.py +0 -78
  339. unstructured_ingest/connector/fsspec/box.py +0 -109
  340. unstructured_ingest/connector/fsspec/dropbox.py +0 -160
  341. unstructured_ingest/connector/fsspec/fsspec.py +0 -359
  342. unstructured_ingest/connector/fsspec/gcs.py +0 -82
  343. unstructured_ingest/connector/fsspec/s3.py +0 -62
  344. unstructured_ingest/connector/fsspec/sftp.py +0 -81
  345. unstructured_ingest/connector/git.py +0 -124
  346. unstructured_ingest/connector/github.py +0 -174
  347. unstructured_ingest/connector/gitlab.py +0 -142
  348. unstructured_ingest/connector/google_drive.py +0 -348
  349. unstructured_ingest/connector/hubspot.py +0 -278
  350. unstructured_ingest/connector/jira.py +0 -469
  351. unstructured_ingest/connector/kafka.py +0 -293
  352. unstructured_ingest/connector/local.py +0 -139
  353. unstructured_ingest/connector/mongodb.py +0 -284
  354. unstructured_ingest/connector/notion/client.py +0 -248
  355. unstructured_ingest/connector/notion/connector.py +0 -469
  356. unstructured_ingest/connector/notion/helpers.py +0 -584
  357. unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
  358. unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
  359. unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
  360. unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
  361. unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
  362. unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
  363. unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
  364. unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
  365. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
  366. unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
  367. unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
  368. unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
  369. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
  370. unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
  371. unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
  372. unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
  373. unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
  374. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
  375. unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
  376. unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
  377. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
  378. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
  379. unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
  380. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
  381. unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
  382. unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
  383. unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
  384. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
  385. unstructured_ingest/connector/notion/types/date.py +0 -26
  386. unstructured_ingest/connector/notion/types/file.py +0 -51
  387. unstructured_ingest/connector/notion/types/user.py +0 -76
  388. unstructured_ingest/connector/onedrive.py +0 -232
  389. unstructured_ingest/connector/opensearch.py +0 -218
  390. unstructured_ingest/connector/outlook.py +0 -285
  391. unstructured_ingest/connector/pinecone.py +0 -150
  392. unstructured_ingest/connector/qdrant.py +0 -144
  393. unstructured_ingest/connector/reddit.py +0 -166
  394. unstructured_ingest/connector/registry.py +0 -109
  395. unstructured_ingest/connector/salesforce.py +0 -301
  396. unstructured_ingest/connector/sharepoint.py +0 -573
  397. unstructured_ingest/connector/slack.py +0 -224
  398. unstructured_ingest/connector/sql.py +0 -199
  399. unstructured_ingest/connector/vectara.py +0 -253
  400. unstructured_ingest/connector/weaviate.py +0 -190
  401. unstructured_ingest/connector/wikipedia.py +0 -208
  402. unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
  403. unstructured_ingest/enhanced_dataclass/core.py +0 -99
  404. unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
  405. unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
  406. unstructured_ingest/interfaces.py +0 -852
  407. unstructured_ingest/pipeline/copy.py +0 -19
  408. unstructured_ingest/pipeline/doc_factory.py +0 -12
  409. unstructured_ingest/pipeline/partition.py +0 -60
  410. unstructured_ingest/pipeline/permissions.py +0 -12
  411. unstructured_ingest/pipeline/reformat/chunking.py +0 -134
  412. unstructured_ingest/pipeline/reformat/embedding.py +0 -64
  413. unstructured_ingest/pipeline/source.py +0 -77
  414. unstructured_ingest/pipeline/utils.py +0 -6
  415. unstructured_ingest/pipeline/write.py +0 -18
  416. unstructured_ingest/processor.py +0 -93
  417. unstructured_ingest/runner/__init__.py +0 -104
  418. unstructured_ingest/runner/airtable.py +0 -35
  419. unstructured_ingest/runner/astradb.py +0 -34
  420. unstructured_ingest/runner/base_runner.py +0 -89
  421. unstructured_ingest/runner/biomed.py +0 -45
  422. unstructured_ingest/runner/confluence.py +0 -35
  423. unstructured_ingest/runner/delta_table.py +0 -34
  424. unstructured_ingest/runner/discord.py +0 -35
  425. unstructured_ingest/runner/elasticsearch.py +0 -40
  426. unstructured_ingest/runner/fsspec/azure.py +0 -30
  427. unstructured_ingest/runner/fsspec/box.py +0 -28
  428. unstructured_ingest/runner/fsspec/dropbox.py +0 -30
  429. unstructured_ingest/runner/fsspec/fsspec.py +0 -40
  430. unstructured_ingest/runner/fsspec/gcs.py +0 -28
  431. unstructured_ingest/runner/fsspec/s3.py +0 -28
  432. unstructured_ingest/runner/fsspec/sftp.py +0 -28
  433. unstructured_ingest/runner/github.py +0 -37
  434. unstructured_ingest/runner/gitlab.py +0 -37
  435. unstructured_ingest/runner/google_drive.py +0 -35
  436. unstructured_ingest/runner/hubspot.py +0 -35
  437. unstructured_ingest/runner/jira.py +0 -35
  438. unstructured_ingest/runner/kafka.py +0 -34
  439. unstructured_ingest/runner/local.py +0 -23
  440. unstructured_ingest/runner/mongodb.py +0 -34
  441. unstructured_ingest/runner/notion.py +0 -61
  442. unstructured_ingest/runner/onedrive.py +0 -35
  443. unstructured_ingest/runner/opensearch.py +0 -40
  444. unstructured_ingest/runner/outlook.py +0 -33
  445. unstructured_ingest/runner/reddit.py +0 -35
  446. unstructured_ingest/runner/salesforce.py +0 -33
  447. unstructured_ingest/runner/sharepoint.py +0 -35
  448. unstructured_ingest/runner/slack.py +0 -33
  449. unstructured_ingest/runner/utils.py +0 -47
  450. unstructured_ingest/runner/wikipedia.py +0 -35
  451. unstructured_ingest/runner/writers/__init__.py +0 -48
  452. unstructured_ingest/runner/writers/astradb.py +0 -22
  453. unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
  454. unstructured_ingest/runner/writers/base_writer.py +0 -26
  455. unstructured_ingest/runner/writers/chroma.py +0 -22
  456. unstructured_ingest/runner/writers/clarifai.py +0 -19
  457. unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
  458. unstructured_ingest/runner/writers/delta_table.py +0 -24
  459. unstructured_ingest/runner/writers/elasticsearch.py +0 -24
  460. unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
  461. unstructured_ingest/runner/writers/fsspec/box.py +0 -21
  462. unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
  463. unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
  464. unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
  465. unstructured_ingest/runner/writers/kafka.py +0 -21
  466. unstructured_ingest/runner/writers/mongodb.py +0 -21
  467. unstructured_ingest/runner/writers/opensearch.py +0 -26
  468. unstructured_ingest/runner/writers/pinecone.py +0 -21
  469. unstructured_ingest/runner/writers/qdrant.py +0 -19
  470. unstructured_ingest/runner/writers/sql.py +0 -22
  471. unstructured_ingest/runner/writers/vectara.py +0 -22
  472. unstructured_ingest/runner/writers/weaviate.py +0 -21
  473. unstructured_ingest/utils/google_filetype.py +0 -9
  474. unstructured_ingest/v2/__init__.py +0 -1
  475. unstructured_ingest/v2/cli/__init__.py +0 -0
  476. unstructured_ingest/v2/cli/base/__init__.py +0 -4
  477. unstructured_ingest/v2/cli/base/cmd.py +0 -269
  478. unstructured_ingest/v2/cli/base/dest.py +0 -85
  479. unstructured_ingest/v2/cli/base/src.py +0 -85
  480. unstructured_ingest/v2/cli/cli.py +0 -24
  481. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  482. unstructured_ingest/v2/logger.py +0 -126
  483. unstructured_ingest/v2/main.py +0 -11
  484. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  485. unstructured_ingest/v2/pipeline/interfaces.py +0 -211
  486. unstructured_ingest/v2/pipeline/pipeline.py +0 -408
  487. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  488. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  489. unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
  490. unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
  491. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  492. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
  493. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
  495. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
  496. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
  497. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
  498. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
  499. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
  500. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
  501. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
  502. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
  503. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
  504. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
  505. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
  506. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
  507. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
  508. unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
  515. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
  516. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
  517. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
  518. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
  519. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
  520. unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
  521. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
  522. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
  523. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
  524. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  525. unstructured_ingest/v2/types/__init__.py +0 -0
  526. unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
  527. {test/unit/v2 → examples}/__init__.py +0 -0
  528. /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
  529. /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
  530. /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
  531. /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
  532. /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
  533. /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
  534. /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
  535. /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
  536. /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
  537. /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
  538. /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
  539. /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
  540. /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
  541. /test/unit/{v2/utils → utils}/__init__.py +0 -0
  542. /test/unit/{v2/utils → utils}/data_generator.py +0 -0
  543. /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
  544. /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  545. /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
  546. /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
  547. /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
  548. /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
  549. /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
  550. /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
  551. /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
  552. /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
  553. /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
  554. /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
  555. /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
  556. /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
  557. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
  558. /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
  559. /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
  560. /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
  561. /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
  562. /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
  563. /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
  564. /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
  565. /unstructured_ingest/{v2 → utils}/constants.py +0 -0
  566. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/LICENSE.md +0 -0
  567. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/WHEEL +0 -0
  568. {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/entry_points.txt +0 -0
@@ -1,190 +0,0 @@
1
- import copy
2
- import json
3
- import typing as t
4
- from dataclasses import dataclass, field
5
-
6
- from unstructured_ingest.enhanced_dataclass import enhanced_field
7
- from unstructured_ingest.enhanced_dataclass.core import _asdict
8
- from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
9
- from unstructured_ingest.interfaces import (
10
- AccessConfig,
11
- BaseConnectorConfig,
12
- BaseDestinationConnector,
13
- WriteConfig,
14
- )
15
- from unstructured_ingest.logger import logger
16
- from unstructured_ingest.utils.dep_check import requires_dependencies
17
-
18
- if t.TYPE_CHECKING:
19
- from weaviate import Client
20
-
21
-
22
@dataclass
class WeaviateAccessConfig(AccessConfig):
    """Credentials used to authenticate against a Weaviate instance.

    ``WeaviateDestinationConnector._resolve_auth_method`` inspects these fields
    in a fixed priority order: ``anonymous`` first, then ``access_token``,
    ``api_key``, ``client_secret`` and finally ``username``/``password``.
    Secret values are declared through ``enhanced_field(..., sensitive=True)``.
    """

    # Bearer-token auth; ``refresh_token`` is forwarded together with it.
    access_token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    refresh_token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # API-key auth.
    api_key: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # Client-credentials auth (combined with ``scope``).
    client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # Forwarded for both client-credentials and username/password auth.
    scope: t.Optional[t.List[str]] = None
    # Username/password auth; only used when both values are set.
    username: t.Optional[str] = None
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # When True, no auth object is built at all.
    anonymous: bool = False
32
-
33
-
34
@dataclass
class SimpleWeaviateConfig(BaseConnectorConfig):
    """Connection settings for the Weaviate destination connector."""

    # Credentials consulted when the client's auth object is built.
    access_config: WeaviateAccessConfig
    # Passed as ``url`` when the Weaviate client is constructed.
    host_url: str
    # Weaviate class that every written object is added to.
    class_name: str
39
-
40
-
41
@dataclass
class WeaviateWriteConfig(WriteConfig):
    """Write-side tuning for the Weaviate destination connector."""

    # Handed to ``client.batch.configure(batch_size=...)`` before writing.
    batch_size: int = 100
44
-
45
-
46
@dataclass
class WeaviateDestinationConnector(BaseDestinationConnector):
    """Destination connector that batch-writes element dictionaries into Weaviate."""

    write_config: WeaviateWriteConfig
    connector_config: SimpleWeaviateConfig
    # Created lazily by the ``client`` property; excluded from ``__init__``.
    _client: t.Optional["Client"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """
        Serialize the connector without its live client.

        The _client variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        # Shallow copy so the caller's instance keeps its client.
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_client"):
            setattr(self_cp, "_client", None)
        return _asdict(self_cp, **kwargs)

    @property
    @requires_dependencies(["weaviate"], extras="weaviate")
    def client(self) -> "Client":
        """Return the Weaviate client, creating and caching it on first access."""
        if self._client is None:
            from weaviate import Client

            auth = self._resolve_auth_method()
            self._client = Client(url=self.connector_config.host_url, auth_client_secret=auth)
        return self._client

    @requires_dependencies(["weaviate"], extras="weaviate")
    @DestinationConnectionError.wrap
    def initialize(self):
        """Eagerly build the client so that connection problems surface early."""
        _ = self.client

    @requires_dependencies(["weaviate"], extras="weaviate")
    def check_connection(self):
        """Validate the connection by building the client.

        Raises:
            SourceConnectionError: if the client cannot be created.
        """
        try:
            _ = self.client
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            # NOTE(review): a destination connector raising SourceConnectionError
            # looks unintended, but the type is kept so existing handlers still match.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def _resolve_auth_method(self):
        """Pick the Weaviate auth object matching the configured credentials.

        Returns ``None`` for anonymous access or when no usable credential is set.
        """
        access_configs = self.connector_config.access_config
        if access_configs.anonymous:
            return None

        if access_configs.access_token:
            from weaviate.auth import AuthBearerToken

            return AuthBearerToken(
                access_token=access_configs.access_token,
                refresh_token=access_configs.refresh_token,
            )
        if access_configs.api_key:
            from weaviate.auth import AuthApiKey

            return AuthApiKey(api_key=access_configs.api_key)
        if access_configs.client_secret:
            from weaviate.auth import AuthClientCredentials

            return AuthClientCredentials(
                client_secret=access_configs.client_secret, scope=access_configs.scope
            )
        if access_configs.username and access_configs.password:
            from weaviate.auth import AuthClientPassword

            return AuthClientPassword(
                username=access_configs.username,
                password=access_configs.password,
                scope=access_configs.scope,
            )
        return None

    def conform_dict(self, data: dict) -> None:
        """
        Updates the element dictionary to conform to the Weaviate schema

        ``data`` is modified in place.  Structured values are stored as JSON
        strings, timestamps are re-rendered in one fixed layout, and a few scalar
        fields are cast to ``str``.  Missing, empty or ``None`` sections are skipped.
        """
        from dateutil import parser

        datetime_format = "%Y-%m-%dT%H:%M:%S.%fZ"

        # ``or {}`` also covers keys that are present but None, which a
        # ``.get(key, {})`` default would not.  Non-empty sections are the very
        # objects stored in ``data``, so the assignments below update ``data``.
        metadata = data.get("metadata") or {}
        data_source = metadata.get("data_source") or {}
        coordinates = metadata.get("coordinates") or {}

        # Dict as string formatting
        if record_locator := data_source.get("record_locator"):
            data_source["record_locator"] = json.dumps(record_locator)

        # Array of items as string formatting
        if points := coordinates.get("points"):
            coordinates["points"] = json.dumps(points)

        if links := metadata.get("links"):
            metadata["links"] = json.dumps(links)

        if permissions_data := data_source.get("permissions_data"):
            data_source["permissions_data"] = json.dumps(permissions_data)

        # Datetime formatting
        for date_key in ("date_created", "date_modified", "date_processed"):
            if date_value := data_source.get(date_key):
                data_source[date_key] = parser.parse(date_value).strftime(datetime_format)

        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = parser.parse(last_modified).strftime(datetime_format)

        # String casting
        if version := data_source.get("version"):
            data_source["version"] = str(version)

        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Add every element to the configured Weaviate class through the batch API.

        Note that the ``embeddings`` key is popped from each element dictionary
        (the caller's dictionaries are modified) and sent as the object's vector.
        """
        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"class {self.connector_config.class_name} "
            f"at {self.connector_config.host_url}",
        )

        self.client.batch.configure(batch_size=self.write_config.batch_size)
        with self.client.batch as b:
            for e in elements_dict:
                vector = e.pop("embeddings", None)
                b.add_data_object(
                    e,
                    self.connector_config.class_name,
                    vector=vector,
                )
@@ -1,208 +0,0 @@
1
- import typing as t
2
- from dataclasses import dataclass, field
3
- from pathlib import Path
4
-
5
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
6
- from unstructured_ingest.interfaces import (
7
- BaseConnectorConfig,
8
- BaseSingleIngestDoc,
9
- BaseSourceConnector,
10
- IngestDocCleanupMixin,
11
- SourceConnectorCleanupMixin,
12
- SourceMetadata,
13
- )
14
- from unstructured_ingest.logger import logger
15
- from unstructured_ingest.utils.dep_check import requires_dependencies
16
-
17
- if t.TYPE_CHECKING:
18
- from wikipedia import WikipediaPage
19
-
20
-
21
@dataclass
class SimpleWikipediaConfig(BaseConnectorConfig):
    """Settings identifying the Wikipedia page to ingest."""

    # Title handed to ``wikipedia.page``; also the basis of generated file names.
    page_title: str
    # Forwarded unchanged as ``wikipedia.page(..., auto_suggest=...)``.
    auto_suggest: bool = False
25
-
26
-
27
@dataclass
class WikipediaIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Base ingest doc for a single Wikipedia page.

    ``filename``, ``text`` and ``_output_filename`` raise ``NotImplementedError``
    here; the HTML, text and summary variants provide them.
    """

    connector_config: SimpleWikipediaConfig = field(repr=False)

    @property
    @requires_dependencies(["wikipedia"], extras="wikipedia")
    def page(self) -> "WikipediaPage":
        """Look the configured page up via ``wikipedia.page``.

        The result is not cached: every access performs the lookup again.
        """
        import wikipedia

        return wikipedia.page(
            self.connector_config.page_title,
            auto_suggest=self.connector_config.auto_suggest,
        )

    def get_filename_prefix(self) -> str:
        """Return the page title with each run of whitespace collapsed to one ``-``."""
        title: str = str(self.connector_config.page_title)
        return "-".join(title.split())

    @property
    def filename(self) -> Path:
        raise NotImplementedError()

    @property
    def text(self) -> str:
        raise NotImplementedError()

    @property
    def _output_filename(self):
        raise NotImplementedError()

    @property
    def date_created(self) -> t.Optional[str]:
        return None

    @property
    def date_modified(self) -> t.Optional[str]:
        return None

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Identify the record by page title and the URL stored in the source metadata."""
        return {
            "page_title": self.connector_config.page_title,
            "page_url": self.source_metadata.source_url,  # type: ignore
        }

    def _create_full_tmp_dir_path(self):
        """Create the parent directory of ``filename`` if it does not exist yet."""
        self.filename.parent.mkdir(parents=True, exist_ok=True)

    @requires_dependencies(["wikipedia"], extras="wikipedia")
    def update_source_metadata(self):
        """Refresh ``source_metadata`` from the page; mark it missing on ``PageError``."""
        from wikipedia.exceptions import PageError

        try:
            page = self.page
        except PageError:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return

        self.source_metadata = SourceMetadata(
            version=page.revision_id,
            source_url=page.url,
            exists=True,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetches the "remote" doc and stores it locally on the filesystem."""
        self._create_full_tmp_dir_path()
        self.update_source_metadata()
        # Resolve the text before opening the target: if the fetch raises, no
        # empty or truncated file is left behind on disk.
        content = self.text
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(content)
102
-
103
-
104
@dataclass
class WikipediaIngestHTMLDoc(WikipediaIngestDoc):
    """Ingest doc that stores the page's HTML rendering."""

    registry_name: str = "wikipedia_html"

    @property
    def filename(self) -> Path:
        """Absolute path of the downloaded ``.html`` file inside the download dir."""
        download_dir = Path(self.read_config.download_dir)
        return (download_dir / f"{self.get_filename_prefix()}.html").resolve()

    @property
    def text(self):
        return self._get_html()

    @SourceConnectionNetworkError.wrap
    def _get_html(self):
        """Fetch the page HTML; failures go through ``SourceConnectionNetworkError.wrap``."""
        return self.page.html()

    @property
    def _output_filename(self):
        """Path of the JSON output file inside the processor's output dir."""
        output_dir = Path(self.processor_config.output_dir)
        return output_dir / f"{self.get_filename_prefix()}-html.json"
125
-
126
-
127
@dataclass
class WikipediaIngestTextDoc(WikipediaIngestDoc):
    """Ingest doc that stores the page's ``content`` attribute as text."""

    registry_name: str = "wikipedia_text"

    @property
    def filename(self) -> Path:
        """Absolute path of the downloaded ``.txt`` file inside the download dir."""
        download_dir = Path(self.read_config.download_dir)
        return (download_dir / f"{self.get_filename_prefix()}.txt").resolve()

    @property
    def text(self):
        return self._get_content()

    @SourceConnectionNetworkError.wrap
    def _get_content(self):
        """Fetch the page content; failures go through ``SourceConnectionNetworkError.wrap``."""
        return self.page.content

    @property
    def _output_filename(self):
        """Path of the JSON output file inside the processor's output dir."""
        output_dir = Path(self.processor_config.output_dir)
        return output_dir / f"{self.get_filename_prefix()}-txt.json"
146
-
147
-
148
@dataclass
class WikipediaIngestSummaryDoc(WikipediaIngestDoc):
    """Ingest doc that stores the page's ``summary`` attribute as text."""

    registry_name: str = "wikipedia_summary"

    @property
    def filename(self) -> Path:
        """Absolute path of the downloaded ``-summary.txt`` file inside the download dir."""
        download_dir = Path(self.read_config.download_dir)
        return (download_dir / f"{self.get_filename_prefix()}-summary.txt").resolve()

    @property
    def text(self):
        return self._get_summary()

    @SourceConnectionNetworkError.wrap
    def _get_summary(self):
        """Fetch the page summary; failures go through ``SourceConnectionNetworkError.wrap``."""
        return self.page.summary

    @property
    def _output_filename(self):
        """Path of the JSON output file inside the processor's output dir."""
        output_dir = Path(self.processor_config.output_dir)
        return output_dir / f"{self.get_filename_prefix()}-summary.json"
169
-
170
-
171
@dataclass
class WikipediaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that yields the text, HTML and summary docs of one page."""

    connector_config: SimpleWikipediaConfig

    def initialize(self):
        """Nothing to set up for this connector."""
        pass

    @requires_dependencies(["wikipedia"], extras="wikipedia")
    def check_connection(self):
        """Validate the configuration by looking the page up once.

        Raises:
            SourceConnectionError: if the lookup fails for any reason; the
                original exception is attached as the cause.
        """
        import wikipedia

        try:
            wikipedia.page(
                self.connector_config.page_title,
                auto_suggest=self.connector_config.auto_suggest,
            )
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_ingest_docs(self):
        """Return one ingest doc per representation, in text, HTML, summary order."""
        doc_classes = (
            WikipediaIngestTextDoc,
            WikipediaIngestHTMLDoc,
            WikipediaIngestSummaryDoc,
        )
        # All three variants share the same configuration objects.
        return [
            doc_cls(
                processor_config=self.processor_config,
                connector_config=self.connector_config,
                read_config=self.read_config,
            )
            for doc_cls in doc_classes
        ]
@@ -1,4 +0,0 @@
1
- from .dataclasses import enhanced_field
2
- from .json_mixin import EnhancedDataClassJsonMixin
3
-
4
- __all__ = ["enhanced_field", "EnhancedDataClassJsonMixin"]
@@ -1,99 +0,0 @@
1
- import _thread
2
- import copy
3
- import functools
4
- from dataclasses import fields
5
-
6
- from dataclasses_json.core import (
7
- Collection,
8
- Enum,
9
- Mapping,
10
- _encode_overrides,
11
- _handle_undefined_parameters_safe,
12
- _user_overrides_or_exts,
13
- is_dataclass,
14
- )
15
-
16
-
17
- def _recursive_repr(user_function):
18
- # Copied from dataclasses as this method isn't exposed for importing
19
- repr_running = set()
20
-
21
- @functools.wraps(user_function)
22
- def wrapper(self):
23
- key = id(self), _thread.get_ident()
24
- if key in repr_running:
25
- return "..."
26
- repr_running.add(key)
27
- try:
28
- result = user_function(self)
29
- finally:
30
- repr_running.discard(key)
31
- return result
32
-
33
- return wrapper
34
-
35
-
36
- def _asdict(
37
- obj,
38
- encode_json=False,
39
- redact_sensitive=False,
40
- redacted_text="***REDACTED***",
41
- apply_name_overload: bool = True,
42
- ):
43
- """
44
- A re-implementation of `asdict` (based on the original in the `dataclasses`
45
- source) to support arbitrary Collection and Mapping types.
46
- """
47
- if is_dataclass(obj):
48
- result = []
49
- overrides = _user_overrides_or_exts(obj)
50
- for field in fields(obj):
51
- if overrides[field.name].encoder:
52
- value = getattr(obj, field.name)
53
- else:
54
- value = _asdict(
55
- getattr(obj, field.name),
56
- encode_json=encode_json,
57
- redact_sensitive=redact_sensitive,
58
- redacted_text=redacted_text,
59
- apply_name_overload=apply_name_overload,
60
- )
61
- if getattr(field, "sensitive", False) and redact_sensitive and value:
62
- value = redacted_text
63
- if getattr(field, "overload_name", None) and apply_name_overload:
64
- overload_name = getattr(field, "overload_name")
65
- result.append((overload_name, value))
66
- else:
67
- result.append((field.name, value))
68
-
69
- result = _handle_undefined_parameters_safe(cls=obj, kvs=dict(result), usage="to")
70
- return _encode_overrides(
71
- dict(result), _user_overrides_or_exts(obj), encode_json=encode_json
72
- )
73
- elif isinstance(obj, Mapping):
74
- return {
75
- _asdict(
76
- k,
77
- encode_json=encode_json,
78
- redact_sensitive=redact_sensitive,
79
- redacted_text=redacted_text,
80
- ): _asdict(
81
- v,
82
- encode_json=encode_json,
83
- redact_sensitive=redact_sensitive,
84
- redacted_text=redacted_text,
85
- )
86
- for k, v in obj.items()
87
- }
88
- elif isinstance(obj, Collection) and not isinstance(obj, (str, bytes, Enum)):
89
- return [
90
- _asdict(
91
- v,
92
- encode_json=encode_json,
93
- redact_sensitive=redact_sensitive,
94
- redacted_text=redacted_text,
95
- )
96
- for v in obj
97
- ]
98
- else:
99
- return copy.deepcopy(obj)
@@ -1,54 +0,0 @@
1
- import typing as t
2
- from dataclasses import MISSING, Field
3
-
4
- from unstructured_ingest.enhanced_dataclass.core import _recursive_repr
5
-
6
-
7
class EnhancedField(Field):
    """``dataclasses.Field`` carrying two extra attributes.

    ``sensitive`` and ``overload_name`` are stored on the instance; every
    positional argument is handed to ``Field.__init__`` unchanged.
    """

    def __init__(self, *args, sensitive=False, overload_name: t.Optional[str] = None):
        super().__init__(*args)
        self.sensitive = sensitive
        self.overload_name = overload_name

    @_recursive_repr
    def __repr__(self):
        # Attributes rendered with repr(), in output order.
        repr_attrs = (
            "name",
            "type",
            "default",
            "default_factory",
            "init",
            "repr",
            "hash",
            "compare",
            "metadata",
            "sensitive",
            "overload_name",
        )
        parts = [f"{attr}={getattr(self, attr)!r}" for attr in repr_attrs]
        parts.append(f"_field_type={self._field_type}")
        # Support for kw_only added in 3.10, to support as low as 3.8, need to dynamically map
        if kw_only := getattr(self, "kw_only", None):
            parts.append(f"kw_only={kw_only!r}")
        return "Field({})".format(",".join(parts))
33
-
34
-
35
def enhanced_field(
    *,
    default=MISSING,
    default_factory=MISSING,
    init: bool = True,
    repr: bool = True,
    hash=None,
    compare: bool = True,
    metadata=None,
    kw_only=MISSING,
    sensitive: bool = False,
    overload_name: t.Optional[str] = None,
):
    """Build an ``EnhancedField`` from keyword arguments.

    The standard field options are passed to ``EnhancedField`` positionally;
    ``sensitive`` and ``overload_name`` are passed by keyword.

    Raises:
        ValueError: if both ``default`` and ``default_factory`` are given.
    """
    has_default = default is not MISSING
    if has_default and default_factory is not MISSING:
        raise ValueError("cannot specify both default and default_factory")

    positional = [default, default_factory, init, repr, hash, compare, metadata]
    # Support for kw_only added in 3.10, to support as low as 3.8, need to dynamically map
    if "kw_only" in EnhancedField.__slots__:
        positional.append(kw_only)
    return EnhancedField(*positional, sensitive=sensitive, overload_name=overload_name)
@@ -1,125 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import json
4
- from dataclasses import InitVar, fields
5
- from typing import Any, Callable, Optional, Type, TypeVar, Union
6
-
7
- import dataclasses_json.core as dataclasses_json_core
8
- from dataclasses_json import DataClassJsonMixin
9
-
10
- from unstructured_ingest.enhanced_dataclass.core import _asdict
11
-
12
- A = TypeVar("A", bound="EnhancedDataClassJsonMixin")
13
-
14
# Monkey-patch _decode_dataclass class to support name override
og_decode_dataclass = dataclasses_json_core._decode_dataclass


def custom_decode_dataclass(cls, kvs, infer_missing):
    """Decode ``kvs`` into ``cls`` after mapping overload names back to field names.

    For every field of ``cls`` with a truthy ``overload_name`` attribute, a key
    equal to that overload name is renamed to the field's real name before the
    original ``_decode_dataclass`` runs.  The renaming works on a copy, so the
    caller's dictionary is left unmodified.

    Args:
        cls: dataclass type to decode into.
        kvs: incoming data; only ``dict`` instances are subject to renaming.
        infer_missing: forwarded unchanged to the original decoder.
    """
    overloaded = [
        (f.name, overload_name)
        for f in fields(cls)
        if (overload_name := getattr(f, "overload_name", None))
    ]
    if isinstance(kvs, dict):
        copied = False
        for field_name, overload_name in overloaded:
            if overload_name not in kvs:
                continue
            if not copied:
                # Copy lazily: only when a rename is actually needed.
                kvs = kvs.copy()
                copied = True
            kvs[field_name] = kvs.pop(overload_name)
    return og_decode_dataclass(cls, kvs, infer_missing)


dataclasses_json_core._decode_dataclass = custom_decode_dataclass
33
-
34
-
35
class EnhancedDataClassJsonMixin(DataClassJsonMixin):
    """Extension of ``DataClassJsonMixin`` with redaction and name overloading.

    ``to_dict`` and ``to_json`` go through ``_asdict`` and therefore accept
    ``redact_sensitive``, ``redacted_text`` and ``apply_name_overload``.
    ``from_dict`` decodes through ``dataclasses_json_core._decode_dataclass``.
    All three first reject classes that annotate a field with an ``InitVar``.

    Methods:
        check_init_var: Raise ``TypeError`` if the class annotates an ``InitVar``.
        to_json: Serialize the object to JSON format with customizable options.
        from_dict: Deserialize a dictionary into an object of this class.
        to_dict: Convert the object to a dictionary with customizable options.
    """

    @classmethod
    def check_init_var(cls):
        """Raise ``TypeError`` when ``cls`` itself annotates a field with an ``InitVar``.

        Only annotations found in ``cls.__dict__`` are inspected.
        """
        own_annotations = cls.__dict__.get("__annotations__", {})
        offending = [
            name for name, annotation in own_annotations.items() if isinstance(annotation, InitVar)
        ]
        if not offending:
            return
        raise TypeError(
            "Class {} has the following fields defined with an InitVar which "
            "cannot be used with EnhancedDataClassJsonMixin: {}".format(
                cls.__name__, ", ".join(offending)
            )
        )

    def to_json(
        self,
        *,
        skipkeys: bool = False,
        ensure_ascii: bool = True,
        check_circular: bool = True,
        allow_nan: bool = True,
        indent: Optional[Union[int, str]] = None,
        separators: Optional[tuple[str, str]] = None,
        default: Optional[Callable[..., Any]] = None,
        sort_keys: bool = False,
        redact_sensitive: bool = False,
        redacted_text: str = "***REDACTED***",
        apply_name_overload: bool = True,
        **kw: Any,
    ) -> str:
        """Serialize to a JSON string.

        The object is first converted with ``to_dict`` (``encode_json=False``) and
        then dumped with ``dataclasses_json``'s extended encoder; the remaining
        keyword arguments are forwarded to ``json.dumps``.
        """
        self.check_init_var()
        payload = self.to_dict(
            encode_json=False,
            redact_sensitive=redact_sensitive,
            redacted_text=redacted_text,
            apply_name_overload=apply_name_overload,
        )
        return json.dumps(
            payload,
            cls=dataclasses_json_core._ExtendedEncoder,
            skipkeys=skipkeys,
            ensure_ascii=ensure_ascii,
            check_circular=check_circular,
            allow_nan=allow_nan,
            indent=indent,
            separators=separators,
            default=default,
            sort_keys=sort_keys,
            **kw,
        )

    @classmethod
    def from_dict(
        cls: Type[A],
        kvs: dataclasses_json_core.Json,
        *,
        infer_missing=False,
        apply_name_overload=False,
    ) -> A:
        """Build an instance of ``cls`` from ``kvs``.

        ``apply_name_overload`` is accepted but not consulted in this method body.
        """
        cls.check_init_var()
        return dataclasses_json_core._decode_dataclass(cls, kvs, infer_missing)

    def to_dict(
        self,
        encode_json: bool = False,
        redact_sensitive: bool = False,
        redacted_text: str = "***REDACTED***",
        apply_name_overload: bool = True,
    ) -> dict[str, dataclasses_json_core.Json]:
        """Convert the object to a dictionary via ``_asdict``, forwarding every option."""
        self.check_init_var()
        options = {
            "encode_json": encode_json,
            "redact_sensitive": redact_sensitive,
            "redacted_text": redacted_text,
            "apply_name_overload": apply_name_overload,
        }
        return _asdict(self, **options)