unstructured-ingest 0.0.19__tar.gz → 0.0.22__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (364)
  1. unstructured_ingest-0.0.22/PKG-INFO +186 -0
  2. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/test/test_utils_v2.py +2 -2
  3. unstructured_ingest-0.0.22/unstructured_ingest/__version__.py +1 -0
  4. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/astradb.py +2 -2
  5. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/astradb.py +54 -24
  6. unstructured_ingest-0.0.22/unstructured_ingest/embed/bedrock.py +107 -0
  7. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/embed/huggingface.py +22 -22
  8. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/embed/interfaces.py +11 -4
  9. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/embed/mixedbreadai.py +17 -17
  10. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/embed/octoai.py +7 -7
  11. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/embed/openai.py +15 -20
  12. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/embed/vertexai.py +25 -17
  13. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/embed/voyageai.py +22 -17
  14. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/cli/base/cmd.py +1 -1
  15. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/interfaces/connector.py +1 -1
  16. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/pipeline.py +3 -1
  17. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
  18. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/steps/download.py +6 -2
  19. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
  20. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  21. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/steps/index.py +4 -2
  22. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
  23. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
  24. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  25. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/steps/upload.py +6 -2
  26. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/chunker.py +8 -29
  27. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/airtable.py +1 -1
  28. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/astradb.py +26 -19
  29. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/databricks_volumes.py +11 -8
  30. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/elasticsearch.py +2 -2
  31. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/fsspec/azure.py +31 -5
  32. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/fsspec/box.py +31 -2
  33. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +36 -8
  34. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +25 -77
  35. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +30 -1
  36. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/fsspec/s3.py +15 -18
  37. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +22 -1
  38. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/milvus.py +2 -2
  39. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/opensearch.py +2 -2
  40. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/partitioner.py +9 -55
  41. unstructured_ingest-0.0.22/unstructured_ingest/v2/unstructured_api.py +87 -0
  42. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/utils.py +1 -1
  43. unstructured_ingest-0.0.22/unstructured_ingest.egg-info/PKG-INFO +186 -0
  44. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest.egg-info/SOURCES.txt +1 -0
  45. unstructured_ingest-0.0.22/unstructured_ingest.egg-info/requires.txt +223 -0
  46. unstructured-ingest-0.0.19/PKG-INFO +0 -93
  47. unstructured-ingest-0.0.19/unstructured_ingest/__version__.py +0 -1
  48. unstructured-ingest-0.0.19/unstructured_ingest/embed/bedrock.py +0 -70
  49. unstructured-ingest-0.0.19/unstructured_ingest.egg-info/PKG-INFO +0 -93
  50. unstructured-ingest-0.0.19/unstructured_ingest.egg-info/requires.txt +0 -676
  51. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/LICENSE.md +0 -0
  52. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/README.md +0 -0
  53. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/pyproject.toml +0 -0
  54. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/setup.cfg +0 -0
  55. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/setup.py +0 -0
  56. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/test/test_chunking_utils.py +0 -0
  57. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/test/test_error.py +0 -0
  58. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/test/test_interfaces.py +0 -0
  59. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/test/test_logger.py +0 -0
  60. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/test/test_utils.py +0 -0
  61. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/__init__.py +0 -0
  62. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/__init__.py +0 -0
  63. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/base/__init__.py +0 -0
  64. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/base/cmd.py +0 -0
  65. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/base/dest.py +0 -0
  66. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/base/src.py +0 -0
  67. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cli.py +0 -0
  68. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmd_factory.py +0 -0
  69. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/__init__.py +0 -0
  70. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/airtable.py +0 -0
  71. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/azure_cognitive_search.py +0 -0
  72. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/biomed.py +0 -0
  73. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/chroma.py +0 -0
  74. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/clarifai.py +0 -0
  75. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/confluence.py +0 -0
  76. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/databricks_volumes.py +0 -0
  77. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/delta_table.py +0 -0
  78. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/discord.py +0 -0
  79. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/elasticsearch.py +0 -0
  80. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  81. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/fsspec/azure.py +0 -0
  82. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/fsspec/box.py +0 -0
  83. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -0
  84. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -0
  85. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -0
  86. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/fsspec/s3.py +0 -0
  87. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -0
  88. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/github.py +0 -0
  89. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/gitlab.py +0 -0
  90. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/google_drive.py +0 -0
  91. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/hubspot.py +0 -0
  92. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/jira.py +0 -0
  93. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/kafka.py +0 -0
  94. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/local.py +0 -0
  95. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/mongodb.py +0 -0
  96. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/notion.py +0 -0
  97. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/onedrive.py +0 -0
  98. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/opensearch.py +0 -0
  99. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/outlook.py +0 -0
  100. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/pinecone.py +0 -0
  101. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/qdrant.py +0 -0
  102. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/reddit.py +0 -0
  103. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/salesforce.py +0 -0
  104. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/sharepoint.py +0 -0
  105. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/slack.py +0 -0
  106. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/sql.py +0 -0
  107. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/vectara.py +0 -0
  108. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/weaviate.py +0 -0
  109. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/cmds/wikipedia.py +0 -0
  110. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/common.py +0 -0
  111. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/interfaces.py +0 -0
  112. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/cli/utils.py +0 -0
  113. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/__init__.py +0 -0
  114. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/airtable.py +0 -0
  115. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/azure_cognitive_search.py +0 -0
  116. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/biomed.py +0 -0
  117. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/chroma.py +0 -0
  118. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/clarifai.py +0 -0
  119. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/confluence.py +0 -0
  120. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/databricks_volumes.py +0 -0
  121. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/delta_table.py +0 -0
  122. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/discord.py +0 -0
  123. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/elasticsearch.py +0 -0
  124. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/fsspec/__init__.py +0 -0
  125. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/fsspec/azure.py +0 -0
  126. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/fsspec/box.py +0 -0
  127. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/fsspec/dropbox.py +0 -0
  128. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/fsspec/fsspec.py +0 -0
  129. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/fsspec/gcs.py +0 -0
  130. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/fsspec/s3.py +0 -0
  131. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/fsspec/sftp.py +0 -0
  132. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/git.py +0 -0
  133. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/github.py +0 -0
  134. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/gitlab.py +0 -0
  135. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/google_drive.py +0 -0
  136. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/hubspot.py +0 -0
  137. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/jira.py +0 -0
  138. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/kafka.py +0 -0
  139. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/local.py +0 -0
  140. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/mongodb.py +0 -0
  141. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/__init__.py +0 -0
  142. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/client.py +0 -0
  143. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/connector.py +0 -0
  144. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/helpers.py +0 -0
  145. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/interfaces.py +0 -0
  146. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/__init__.py +0 -0
  147. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/block.py +0 -0
  148. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/__init__.py +0 -0
  149. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -0
  150. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +0 -0
  151. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +0 -0
  152. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/callout.py +0 -0
  153. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -0
  154. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/child_page.py +0 -0
  155. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/code.py +0 -0
  156. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -0
  157. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/divider.py +0 -0
  158. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/embed.py +0 -0
  159. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/equation.py +0 -0
  160. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/file.py +0 -0
  161. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/heading.py +0 -0
  162. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/image.py +0 -0
  163. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -0
  164. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/link_to_page.py +0 -0
  165. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -0
  166. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/paragraph.py +0 -0
  167. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/pdf.py +0 -0
  168. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/quote.py +0 -0
  169. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -0
  170. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/table.py +0 -0
  171. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -0
  172. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/template.py +0 -0
  173. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/todo.py +0 -0
  174. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/toggle.py +0 -0
  175. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -0
  176. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/blocks/video.py +0 -0
  177. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database.py +0 -0
  178. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -0
  179. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -0
  180. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/created_by.py +0 -0
  181. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/created_time.py +0 -0
  182. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/date.py +0 -0
  183. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/email.py +0 -0
  184. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/files.py +0 -0
  185. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -0
  186. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +0 -0
  187. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -0
  188. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -0
  189. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/number.py +0 -0
  190. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/people.py +0 -0
  191. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -0
  192. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -0
  193. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/rich_text.py +0 -0
  194. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/rollup.py +0 -0
  195. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/select.py +0 -0
  196. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/status.py +0 -0
  197. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/title.py +0 -0
  198. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -0
  199. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/url.py +0 -0
  200. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/database_properties/verification.py +0 -0
  201. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/date.py +0 -0
  202. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/file.py +0 -0
  203. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/page.py +0 -0
  204. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/parent.py +0 -0
  205. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/rich_text.py +0 -0
  206. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/notion/types/user.py +0 -0
  207. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/onedrive.py +0 -0
  208. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/opensearch.py +0 -0
  209. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/outlook.py +0 -0
  210. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/pinecone.py +0 -0
  211. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/qdrant.py +0 -0
  212. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/reddit.py +0 -0
  213. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/registry.py +0 -0
  214. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/salesforce.py +0 -0
  215. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/sharepoint.py +0 -0
  216. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/slack.py +0 -0
  217. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/sql.py +0 -0
  218. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/vectara.py +0 -0
  219. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/weaviate.py +0 -0
  220. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/connector/wikipedia.py +0 -0
  221. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/embed/__init__.py +0 -0
  222. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/enhanced_dataclass/__init__.py +0 -0
  223. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/enhanced_dataclass/core.py +0 -0
  224. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -0
  225. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -0
  226. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/error.py +0 -0
  227. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/ingest_backoff/__init__.py +0 -0
  228. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/ingest_backoff/_common.py +0 -0
  229. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/ingest_backoff/_wrapper.py +0 -0
  230. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/interfaces.py +0 -0
  231. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/logger.py +0 -0
  232. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/main.py +0 -0
  233. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/__init__.py +0 -0
  234. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/copy.py +0 -0
  235. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/doc_factory.py +0 -0
  236. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/interfaces.py +0 -0
  237. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/partition.py +0 -0
  238. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/permissions.py +0 -0
  239. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/pipeline.py +0 -0
  240. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  241. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/reformat/chunking.py +0 -0
  242. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/reformat/embedding.py +0 -0
  243. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/source.py +0 -0
  244. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/utils.py +0 -0
  245. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/pipeline/write.py +0 -0
  246. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/processor.py +0 -0
  247. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/__init__.py +0 -0
  248. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/airtable.py +0 -0
  249. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/astradb.py +0 -0
  250. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/base_runner.py +0 -0
  251. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/biomed.py +0 -0
  252. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/confluence.py +0 -0
  253. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/delta_table.py +0 -0
  254. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/discord.py +0 -0
  255. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/elasticsearch.py +0 -0
  256. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/fsspec/__init__.py +0 -0
  257. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/fsspec/azure.py +0 -0
  258. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/fsspec/box.py +0 -0
  259. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/fsspec/dropbox.py +0 -0
  260. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/fsspec/fsspec.py +0 -0
  261. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/fsspec/gcs.py +0 -0
  262. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/fsspec/s3.py +0 -0
  263. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/fsspec/sftp.py +0 -0
  264. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/github.py +0 -0
  265. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/gitlab.py +0 -0
  266. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/google_drive.py +0 -0
  267. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/hubspot.py +0 -0
  268. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/jira.py +0 -0
  269. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/kafka.py +0 -0
  270. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/local.py +0 -0
  271. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/mongodb.py +0 -0
  272. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/notion.py +0 -0
  273. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/onedrive.py +0 -0
  274. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/opensearch.py +0 -0
  275. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/outlook.py +0 -0
  276. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/reddit.py +0 -0
  277. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/salesforce.py +0 -0
  278. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/sharepoint.py +0 -0
  279. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/slack.py +0 -0
  280. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/utils.py +0 -0
  281. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/wikipedia.py +0 -0
  282. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/__init__.py +0 -0
  283. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/astradb.py +0 -0
  284. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -0
  285. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/base_writer.py +0 -0
  286. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/chroma.py +0 -0
  287. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/clarifai.py +0 -0
  288. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/databricks_volumes.py +0 -0
  289. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/delta_table.py +0 -0
  290. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/elasticsearch.py +0 -0
  291. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  292. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/fsspec/azure.py +0 -0
  293. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/fsspec/box.py +0 -0
  294. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -0
  295. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/fsspec/gcs.py +0 -0
  296. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/fsspec/s3.py +0 -0
  297. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/kafka.py +0 -0
  298. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/mongodb.py +0 -0
  299. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/opensearch.py +0 -0
  300. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/pinecone.py +0 -0
  301. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/qdrant.py +0 -0
  302. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/sql.py +0 -0
  303. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/vectara.py +0 -0
  304. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/runner/writers/weaviate.py +0 -0
  305. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/utils/__init__.py +0 -0
  306. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/utils/chunking.py +0 -0
  307. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/utils/compression.py +0 -0
  308. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/utils/data_prep.py +0 -0
  309. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/utils/dep_check.py +0 -0
  310. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/utils/google_filetype.py +0 -0
  311. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
  312. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/utils/table.py +0 -0
  313. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/__init__.py +0 -0
  314. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/cli/__init__.py +0 -0
  315. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/cli/base/__init__.py +0 -0
  316. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/cli/base/dest.py +0 -0
  317. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/cli/base/importer.py +0 -0
  318. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/cli/base/src.py +0 -0
  319. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/cli/cli.py +0 -0
  320. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/cli/cmds.py +0 -0
  321. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  322. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/cli/utils/click.py +0 -0
  323. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/cli/utils/model_conversion.py +0 -0
  324. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/interfaces/__init__.py +0 -0
  325. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/interfaces/downloader.py +0 -0
  326. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/interfaces/file_data.py +0 -0
  327. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/interfaces/indexer.py +0 -0
  328. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/interfaces/process.py +0 -0
  329. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/interfaces/processor.py +0 -0
  330. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/interfaces/upload_stager.py +0 -0
  331. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/interfaces/uploader.py +0 -0
  332. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/logger.py +0 -0
  333. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/main.py +0 -0
  334. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/otel.py +0 -0
  335. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/__init__.py +0 -0
  336. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/interfaces.py +0 -0
  337. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/otel.py +0 -0
  338. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  339. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/__init__.py +0 -0
  340. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connector_registry.py +0 -0
  341. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/__init__.py +0 -0
  342. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +0 -0
  343. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/chroma.py +0 -0
  344. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/couchbase.py +0 -0
  345. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +0 -0
  346. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/fsspec/utils.py +0 -0
  347. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/google_drive.py +0 -0
  348. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/kdbai.py +0 -0
  349. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/local.py +0 -0
  350. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/mongodb.py +0 -0
  351. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/onedrive.py +0 -0
  352. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/pinecone.py +0 -0
  353. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/salesforce.py +0 -0
  354. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/sharepoint.py +0 -0
  355. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/singlestore.py +0 -0
  356. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/sql.py +0 -0
  357. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/utils.py +0 -0
  358. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/connectors/weaviate.py +0 -0
  359. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/embedder.py +0 -0
  360. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/filter.py +0 -0
  361. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest/v2/processes/uncompress.py +0 -0
  362. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest.egg-info/dependency_links.txt +0 -0
  363. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest.egg-info/entry_points.txt +0 -0
  364. {unstructured-ingest-0.0.19 → unstructured_ingest-0.0.22}/unstructured_ingest.egg-info/top_level.txt +0 -0
@@ -0,0 +1,186 @@
1
+ Metadata-Version: 2.1
2
+ Name: unstructured-ingest
3
+ Version: 0.0.22
4
+ Summary: A library that prepares raw documents for downstream ML tasks.
5
+ Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
+ Author: Unstructured Technologies
7
+ Author-email: devops@unstructuredai.io
8
+ License: Apache-2.0
9
+ Keywords: NLP PDF HTML CV XML parsing preprocessing
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Education
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.9.0,<3.13
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE.md
25
+ Requires-Dist: pandas
26
+ Requires-Dist: pydantic>=2.7
27
+ Requires-Dist: click
28
+ Requires-Dist: opentelemetry-sdk
29
+ Requires-Dist: dataclasses_json
30
+ Requires-Dist: tqdm
31
+ Requires-Dist: python-dateutil
32
+ Provides-Extra: remote
33
+ Requires-Dist: unstructured-client>=0.25.8; extra == "remote"
34
+ Provides-Extra: csv
35
+ Requires-Dist: unstructured[tsv]; extra == "csv"
36
+ Provides-Extra: doc
37
+ Requires-Dist: unstructured[docx]; extra == "doc"
38
+ Provides-Extra: docx
39
+ Requires-Dist: unstructured[docx]; extra == "docx"
40
+ Provides-Extra: epub
41
+ Requires-Dist: unstructured[epub]; extra == "epub"
42
+ Provides-Extra: md
43
+ Requires-Dist: unstructured[md]; extra == "md"
44
+ Provides-Extra: msg
45
+ Requires-Dist: unstructured[msg]; extra == "msg"
46
+ Provides-Extra: odt
47
+ Requires-Dist: unstructured[odt]; extra == "odt"
48
+ Provides-Extra: org
49
+ Requires-Dist: unstructured[org]; extra == "org"
50
+ Provides-Extra: pdf
51
+ Requires-Dist: unstructured[pdf]; extra == "pdf"
52
+ Provides-Extra: ppt
53
+ Requires-Dist: unstructured[pptx]; extra == "ppt"
54
+ Provides-Extra: pptx
55
+ Requires-Dist: unstructured[pptx]; extra == "pptx"
56
+ Provides-Extra: rtf
57
+ Requires-Dist: unstructured[rtf]; extra == "rtf"
58
+ Provides-Extra: rst
59
+ Requires-Dist: unstructured[rst]; extra == "rst"
60
+ Provides-Extra: tsv
61
+ Requires-Dist: unstructured[tsv]; extra == "tsv"
62
+ Provides-Extra: xlsx
63
+ Requires-Dist: unstructured[xlsx]; extra == "xlsx"
64
+ Provides-Extra: airtable
65
+ Requires-Dist: pyairtable; extra == "airtable"
66
+ Provides-Extra: astradb
67
+ Requires-Dist: astrapy; extra == "astradb"
68
+ Provides-Extra: azure
69
+ Requires-Dist: adlfs; extra == "azure"
70
+ Requires-Dist: fsspec; extra == "azure"
71
+ Provides-Extra: azure-cognitive-search
72
+ Requires-Dist: azure-search-documents; extra == "azure-cognitive-search"
73
+ Provides-Extra: biomed
74
+ Requires-Dist: requests; extra == "biomed"
75
+ Requires-Dist: bs4; extra == "biomed"
76
+ Provides-Extra: box
77
+ Requires-Dist: fsspec; extra == "box"
78
+ Requires-Dist: boxfs; extra == "box"
79
+ Provides-Extra: chroma
80
+ Requires-Dist: chromadb; extra == "chroma"
81
+ Provides-Extra: clarifai
82
+ Requires-Dist: clarifai; extra == "clarifai"
83
+ Provides-Extra: confluence
84
+ Requires-Dist: requests; extra == "confluence"
85
+ Requires-Dist: atlassian-python-api; extra == "confluence"
86
+ Provides-Extra: couchbase
87
+ Requires-Dist: couchbase; extra == "couchbase"
88
+ Provides-Extra: delta-table
89
+ Requires-Dist: deltalake; extra == "delta-table"
90
+ Requires-Dist: fsspec; extra == "delta-table"
91
+ Provides-Extra: discord
92
+ Requires-Dist: discord-py; extra == "discord"
93
+ Provides-Extra: dropbox
94
+ Requires-Dist: fsspec; extra == "dropbox"
95
+ Requires-Dist: dropboxdrivefs; extra == "dropbox"
96
+ Provides-Extra: elasticsearch
97
+ Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
98
+ Provides-Extra: gcs
99
+ Requires-Dist: gcsfs; extra == "gcs"
100
+ Requires-Dist: fsspec; extra == "gcs"
101
+ Requires-Dist: bs4; extra == "gcs"
102
+ Provides-Extra: github
103
+ Requires-Dist: requests; extra == "github"
104
+ Requires-Dist: pygithub>1.58.0; extra == "github"
105
+ Provides-Extra: gitlab
106
+ Requires-Dist: python-gitlab; extra == "gitlab"
107
+ Provides-Extra: google-drive
108
+ Requires-Dist: google-api-python-client; extra == "google-drive"
109
+ Provides-Extra: hubspot
110
+ Requires-Dist: urllib3; extra == "hubspot"
111
+ Requires-Dist: hubspot-api-client; extra == "hubspot"
112
+ Provides-Extra: jira
113
+ Requires-Dist: atlassian-python-api; extra == "jira"
114
+ Provides-Extra: kafka
115
+ Requires-Dist: confluent-kafka; extra == "kafka"
116
+ Provides-Extra: kdbai
117
+ Requires-Dist: kdbai-client; extra == "kdbai"
118
+ Provides-Extra: milvus
119
+ Requires-Dist: pymilvus; extra == "milvus"
120
+ Provides-Extra: mongodb
121
+ Requires-Dist: pymongo; extra == "mongodb"
122
+ Provides-Extra: notion
123
+ Requires-Dist: httpx; extra == "notion"
124
+ Requires-Dist: htmlBuilder; extra == "notion"
125
+ Requires-Dist: notion-client; extra == "notion"
126
+ Requires-Dist: backoff; extra == "notion"
127
+ Provides-Extra: onedrive
128
+ Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
129
+ Requires-Dist: msal; extra == "onedrive"
130
+ Requires-Dist: bs4; extra == "onedrive"
131
+ Provides-Extra: opensearch
132
+ Requires-Dist: opensearch-py; extra == "opensearch"
133
+ Provides-Extra: outlook
134
+ Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
135
+ Requires-Dist: msal; extra == "outlook"
136
+ Provides-Extra: pinecone
137
+ Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
138
+ Provides-Extra: postgres
139
+ Requires-Dist: psycopg2-binary; extra == "postgres"
140
+ Provides-Extra: qdrant
141
+ Requires-Dist: qdrant-client; extra == "qdrant"
142
+ Provides-Extra: reddit
143
+ Requires-Dist: praw; extra == "reddit"
144
+ Provides-Extra: s3
145
+ Requires-Dist: s3fs; extra == "s3"
146
+ Requires-Dist: fsspec; extra == "s3"
147
+ Provides-Extra: sharepoint
148
+ Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
149
+ Requires-Dist: msal; extra == "sharepoint"
150
+ Provides-Extra: salesforce
151
+ Requires-Dist: simple-salesforce; extra == "salesforce"
152
+ Provides-Extra: sftp
153
+ Requires-Dist: paramiko; extra == "sftp"
154
+ Requires-Dist: fsspec; extra == "sftp"
155
+ Provides-Extra: slack
156
+ Requires-Dist: slack_sdk; extra == "slack"
157
+ Provides-Extra: wikipedia
158
+ Requires-Dist: wikipedia; extra == "wikipedia"
159
+ Provides-Extra: weaviate
160
+ Requires-Dist: weaviate-client; extra == "weaviate"
161
+ Provides-Extra: databricks-volumes
162
+ Requires-Dist: databricks-sdk; extra == "databricks-volumes"
163
+ Provides-Extra: singlestore
164
+ Requires-Dist: singlestoredb; extra == "singlestore"
165
+ Provides-Extra: vectara
166
+ Requires-Dist: requests; extra == "vectara"
167
+ Provides-Extra: embed-huggingface
168
+ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
169
+ Provides-Extra: embed-octoai
170
+ Requires-Dist: tiktoken; extra == "embed-octoai"
171
+ Requires-Dist: openai; extra == "embed-octoai"
172
+ Provides-Extra: embed-vertexai
173
+ Requires-Dist: vertexai; extra == "embed-vertexai"
174
+ Provides-Extra: embed-voyageai
175
+ Requires-Dist: voyageai; extra == "embed-voyageai"
176
+ Provides-Extra: embed-mixedbreadai
177
+ Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
178
+ Provides-Extra: openai
179
+ Requires-Dist: tiktoken; extra == "openai"
180
+ Requires-Dist: openai; extra == "openai"
181
+ Provides-Extra: bedrock
182
+ Requires-Dist: boto3; extra == "bedrock"
183
+
184
+ # Unstructured Ingest
185
+
186
+ For details, see the [Unstructured Ingest overview](https://docs.unstructured.io/ingestion/overview) in the Unstructured documentation.
@@ -34,7 +34,7 @@ model = MockBaseModel(
34
34
 
35
35
  def test_serialize_base_model():
36
36
 
37
- serialized_dict = model.dict()
37
+ serialized_dict = model.model_dump()
38
38
  assert isinstance(serialized_dict["secret_str"], _SecretBase)
39
39
  assert isinstance(serialized_dict["secret_child_base"], _SecretBase)
40
40
 
@@ -57,7 +57,7 @@ def test_serialize_base_model():
57
57
 
58
58
 
59
59
  def test_serialize_base_model_json():
60
- serialized_json = model.json()
60
+ serialized_json = model.model_dump_json()
61
61
  serialized_dict = json.loads(serialized_json)
62
62
  expected_dict = {
63
63
  "secret_str": "**********",
@@ -0,0 +1 @@
1
+ __version__ = "0.0.22" # pragma: no cover
@@ -37,11 +37,11 @@ class AstraDBCliConfig(SimpleAstraDBConfig, CliConfig):
37
37
  "numbers, and underscores.",
38
38
  ),
39
39
  click.Option(
40
- ["--namespace"],
40
+ ["--keyspace"],
41
41
  required=False,
42
42
  default=None,
43
43
  type=str,
44
- help="The Astra DB connection namespace.",
44
+ help="The Astra DB connection keyspace.",
45
45
  ),
46
46
  ]
47
47
  return options
@@ -24,7 +24,8 @@ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
24
24
  from unstructured_ingest.utils.dep_check import requires_dependencies
25
25
 
26
26
  if t.TYPE_CHECKING:
27
- from astrapy.db import AstraDB, AstraDBCollection
27
+ from astrapy import Collection as AstraDBCollection
28
+ from astrapy import Database as AstraDB
28
29
 
29
30
  NON_INDEXED_FIELDS = ["metadata._node_content", "content"]
30
31
 
@@ -39,6 +40,7 @@ class AstraDBAccessConfig(AccessConfig):
39
40
  class SimpleAstraDBConfig(BaseConnectorConfig):
40
41
  access_config: AstraDBAccessConfig
41
42
  collection_name: str
43
+ keyspace: t.Optional[str] = None
42
44
  namespace: t.Optional[str] = None
43
45
 
44
46
 
@@ -98,22 +100,30 @@ class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
98
100
  @requires_dependencies(["astrapy"], extras="astradb")
99
101
  def astra_db_collection(self) -> "AstraDBCollection":
100
102
  if self._astra_db_collection is None:
101
- from astrapy.db import AstraDB
103
+ from astrapy import DataAPIClient as AstraDBClient
102
104
 
103
- # Build the Astra DB object.
105
+ # Choose keyspace or deprecated namespace
106
+ keyspace_param = self.connector_config.keyspace or self.connector_config.namespace
107
+
108
+ # Create a client object to interact with the Astra DB
104
109
  # caller_name/version for Astra DB tracking
105
- self._astra_db = AstraDB(
106
- api_endpoint=self.connector_config.access_config.api_endpoint,
107
- token=self.connector_config.access_config.token,
108
- namespace=self.connector_config.namespace,
110
+ my_client = AstraDBClient(
109
111
  caller_name=integration_name,
110
112
  caller_version=integration_version,
111
113
  )
112
114
 
113
- # Create and connect to the collection
114
- self._astra_db_collection = self._astra_db.collection(
115
- collection_name=self.connector_config.collection_name,
115
+ # Get the database object
116
+ self._astra_db = my_client.get_database(
117
+ api_endpoint=self.connector_config.access_config.api_endpoint,
118
+ token=self.connector_config.access_config.token,
119
+ keyspace=keyspace_param,
116
120
  )
121
+
122
+ # Create and connect to the newly created collection
123
+ self._astra_db_collection = self._astra_db.get_collection(
124
+ name=self.connector_config.collection_name,
125
+ )
126
+
117
127
  return self._astra_db_collection # type: ignore
118
128
 
119
129
  @requires_dependencies(["astrapy"], extras="astradb")
@@ -132,8 +142,14 @@ class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
132
142
  @requires_dependencies(["astrapy"], extras="astradb")
133
143
  def get_ingest_docs(self): # type: ignore
134
144
  # Perform the find operation
135
- astra_db_docs = list(self.astra_db_collection.paginated_find())
145
+ astra_db_docs_cursor = self.astra_db_collection.find({})
136
146
 
147
+ # Iterate over the cursor
148
+ astra_db_docs = []
149
+ for result in astra_db_docs_cursor:
150
+ astra_db_docs.append(result)
151
+
152
+ # Create a list of AstraDBIngestDoc objects
137
153
  doc_list = []
138
154
  for record in astra_db_docs:
139
155
  doc = AstraDBIngestDoc(
@@ -182,30 +198,41 @@ class AstraDBDestinationConnector(BaseDestinationConnector):
182
198
  @requires_dependencies(["astrapy"], extras="astradb")
183
199
  def astra_db_collection(self) -> "AstraDBCollection":
184
200
  if self._astra_db_collection is None:
185
- from astrapy.db import AstraDB
201
+ from astrapy import DataAPIClient as AstraDBClient
202
+ from astrapy.exceptions import CollectionAlreadyExistsException
203
+
204
+ # Choose keyspace or deprecated namespace
205
+ keyspace_param = self.connector_config.keyspace or self.connector_config.namespace
186
206
 
187
207
  collection_name = self.connector_config.collection_name
188
208
  embedding_dimension = self.write_config.embedding_dimension
189
-
190
- # If the user has requested an indexing policy, pass it to the Astra DB
191
209
  requested_indexing_policy = self.write_config.requested_indexing_policy
192
- options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
193
210
 
211
+ # Create a client object to interact with the Astra DB
194
212
  # caller_name/version for Astra DB tracking
195
- self._astra_db = AstraDB(
196
- api_endpoint=self.connector_config.access_config.api_endpoint,
197
- token=self.connector_config.access_config.token,
198
- namespace=self.connector_config.namespace,
213
+ my_client = AstraDBClient(
199
214
  caller_name=integration_name,
200
215
  caller_version=integration_version,
201
216
  )
202
217
 
203
- # Create and connect to the newly created collection
204
- self._astra_db_collection = self._astra_db.create_collection(
205
- collection_name=collection_name,
206
- dimension=embedding_dimension,
207
- options=options,
218
+ # Get the database object
219
+ self._astra_db = my_client.get_database(
220
+ api_endpoint=self.connector_config.access_config.api_endpoint,
221
+ token=self.connector_config.access_config.token,
222
+ keyspace=keyspace_param,
208
223
  )
224
+
225
+ # Create and connect to the newly created collection
226
+ try:
227
+ self._astra_db_collection = self._astra_db.create_collection(
228
+ name=collection_name,
229
+ dimension=embedding_dimension,
230
+ indexing=requested_indexing_policy,
231
+ )
232
+ except CollectionAlreadyExistsException as e:
233
+ logger.info(f"{e}", exc_info=True)
234
+ self._astra_db_collection = self._astra_db.get_collection(name=collection_name)
235
+
209
236
  return self._astra_db_collection
210
237
 
211
238
  @requires_dependencies(["astrapy"], extras="astradb")
@@ -224,6 +251,9 @@ class AstraDBDestinationConnector(BaseDestinationConnector):
224
251
  def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
225
252
  logger.info(f"inserting / updating {len(elements_dict)} documents to Astra DB.")
226
253
 
254
+ if self._astra_db_collection is None:
255
+ raise DestinationConnectionError("Astra DB collection not available for insertion.")
256
+
227
257
  astra_db_batch_size = self.write_config.batch_size
228
258
 
229
259
  for batch in batch_generator(elements_dict, astra_db_batch_size):
@@ -0,0 +1,107 @@
1
+ import json
2
+ import os
3
+ from dataclasses import dataclass
4
+ from typing import TYPE_CHECKING
5
+
6
+ import numpy as np
7
+ from pydantic import Field, SecretStr
8
+
9
+ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
+
12
+ if TYPE_CHECKING:
13
+ from botocore.client import BaseClient
14
+
15
+ class BedrockClient(BaseClient):
16
+ def invoke_model(self, body: str, modelId: str, trace: str) -> dict:
17
+ pass
18
+
19
+
20
+ class BedrockEmbeddingConfig(EmbeddingConfig):
21
+ aws_access_key_id: SecretStr
22
+ aws_secret_access_key: SecretStr
23
+ region_name: str = "us-west-2"
24
+ embed_model_name: str = Field(default="amazon.titan-embed-text-v1", alias="model_name")
25
+
26
+ @requires_dependencies(
27
+ ["boto3", "numpy", "botocore"],
28
+ extras="bedrock",
29
+ )
30
+ def get_client(self) -> "BedrockClient":
31
+ # delay import only when needed
32
+ import boto3
33
+
34
+ bedrock_client = boto3.client(
35
+ service_name="bedrock-runtime",
36
+ aws_access_key_id=self.aws_access_key_id.get_secret_value(),
37
+ aws_secret_access_key=self.aws_secret_access_key.get_secret_value(),
38
+ region_name=self.region_name,
39
+ )
40
+
41
+ return bedrock_client
42
+
43
+
44
+ @dataclass
45
+ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
46
+ config: BedrockEmbeddingConfig
47
+
48
+ def get_exemplary_embedding(self) -> list[float]:
49
+ return self.embed_query(query="Q")
50
+
51
+ def num_of_dimensions(self) -> tuple[int, ...]:
52
+ exemplary_embedding = self.get_exemplary_embedding()
53
+ return np.shape(exemplary_embedding)
54
+
55
+ def is_unit_vector(self) -> bool:
56
+ exemplary_embedding = self.get_exemplary_embedding()
57
+ return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
58
+
59
+ def embed_query(self, query: str) -> list[float]:
60
+ """Call out to Bedrock embedding endpoint."""
61
+ # replace newlines, which can negatively affect performance.
62
+ text = query.replace(os.linesep, " ")
63
+
64
+ # format input body for provider
65
+ provider = self.config.embed_model_name.split(".")[0]
66
+ input_body = {}
67
+ if provider == "cohere":
68
+ if "input_type" not in input_body:
69
+ input_body["input_type"] = "search_document"
70
+ input_body["texts"] = [text]
71
+ else:
72
+ # includes common provider == "amazon"
73
+ input_body["inputText"] = text
74
+ body = json.dumps(input_body)
75
+
76
+ try:
77
+ bedrock_client = self.config.get_client()
78
+ # invoke bedrock API
79
+ response = bedrock_client.invoke_model(
80
+ body=body,
81
+ modelId=self.config.embed_model_name,
82
+ accept="application/json",
83
+ contentType="application/json",
84
+ )
85
+
86
+ # format output based on provider
87
+ response_body = json.loads(response.get("body").read())
88
+ if provider == "cohere":
89
+ return response_body.get("embeddings")[0]
90
+ else:
91
+ # includes common provider == "amazon"
92
+ return response_body.get("embedding")
93
+ except Exception as e:
94
+ raise ValueError(f"Error raised by inference endpoint: {e}")
95
+
96
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
97
+ embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
98
+ elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
99
+ return elements_with_embeddings
100
+
101
+ def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
102
+ assert len(elements) == len(embeddings)
103
+ elements_w_embedding = []
104
+ for i, element in enumerate(elements):
105
+ element["embeddings"] = embeddings[i]
106
+ elements_w_embedding.append(element)
107
+ return elements
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass
2
- from typing import TYPE_CHECKING, List, Optional
2
+ from typing import TYPE_CHECKING, Optional
3
3
 
4
4
  import numpy as np
5
5
  from pydantic import Field
@@ -8,7 +8,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
8
8
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
9
 
10
10
  if TYPE_CHECKING:
11
- from langchain_huggingface.embeddings import HuggingFaceEmbeddings
11
+ from sentence_transformers import SentenceTransformer
12
12
 
13
13
 
14
14
  class HuggingFaceEmbeddingConfig(EmbeddingConfig):
@@ -19,51 +19,51 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
19
19
  default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
20
20
  )
21
21
  encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
22
- cache_folder: Optional[dict] = Field(default=None)
22
+ cache_folder: Optional[str] = Field(default=None)
23
23
 
24
24
  @requires_dependencies(
25
- ["langchain_huggingface"],
25
+ ["sentence_transformers"],
26
26
  extras="embed-huggingface",
27
27
  )
28
- def get_client(self) -> "HuggingFaceEmbeddings":
29
- """Creates a langchain Huggingface python client to embed elements."""
30
- from langchain_huggingface.embeddings import HuggingFaceEmbeddings
31
-
32
- client = HuggingFaceEmbeddings(
33
- model_name=self.embedder_model_name,
34
- model_kwargs=self.embedder_model_kwargs,
35
- encode_kwargs=self.encode_kwargs,
28
+ def get_client(self) -> "SentenceTransformer":
29
+ from sentence_transformers import SentenceTransformer
30
+
31
+ return SentenceTransformer(
32
+ model_name_or_path=self.embedder_model_name,
36
33
  cache_folder=self.cache_folder,
34
+ **self.embedder_model_kwargs,
37
35
  )
38
- return client
39
36
 
40
37
 
41
38
  @dataclass
42
39
  class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
43
40
  config: HuggingFaceEmbeddingConfig
44
41
 
45
- def get_exemplary_embedding(self) -> List[float]:
42
+ def get_exemplary_embedding(self) -> list[float]:
46
43
  return self.embed_query(query="Q")
47
44
 
48
- def num_of_dimensions(self):
45
+ def num_of_dimensions(self) -> tuple[int, ...]:
49
46
  exemplary_embedding = self.get_exemplary_embedding()
50
47
  return np.shape(exemplary_embedding)
51
48
 
52
- def is_unit_vector(self):
49
+ def is_unit_vector(self) -> bool:
53
50
  exemplary_embedding = self.get_exemplary_embedding()
54
51
  return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
55
52
 
56
- def embed_query(self, query):
57
- client = self.config.get_client()
58
- return client.embed_query(str(query))
53
+ def embed_query(self, query: str) -> list[float]:
54
+ return self._embed_documents(texts=[query])[0]
59
55
 
60
- def embed_documents(self, elements: List[dict]) -> List[dict]:
56
+ def _embed_documents(self, texts: list[str]) -> list[list[float]]:
61
57
  client = self.config.get_client()
62
- embeddings = client.embed_documents([e.get("text", "") for e in elements])
58
+ embeddings = client.encode(texts, **self.config.encode_kwargs)
59
+ return embeddings.tolist()
60
+
61
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
62
+ embeddings = self._embed_documents([e.get("text", "") for e in elements])
63
63
  elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
64
64
  return elements_with_embeddings
65
65
 
66
- def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> List[dict]:
66
+ def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> list[dict]:
67
67
  assert len(elements) == len(embeddings)
68
68
  elements_w_embedding = []
69
69
 
@@ -1,6 +1,5 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from dataclasses import dataclass
3
- from typing import List, Tuple
4
3
 
5
4
  from pydantic import BaseModel
6
5
 
@@ -19,7 +18,7 @@ class BaseEmbeddingEncoder(ABC):
19
18
 
20
19
  @property
21
20
  @abstractmethod
22
- def num_of_dimensions(self) -> Tuple[int]:
21
+ def num_of_dimensions(self) -> tuple[int, ...]:
23
22
  """Number of dimensions for the embedding vector."""
24
23
 
25
24
  @property
@@ -28,9 +27,17 @@ class BaseEmbeddingEncoder(ABC):
28
27
  """Denotes if the embedding vector is a unit vector."""
29
28
 
30
29
  @abstractmethod
31
- def embed_documents(self, elements: List[dict]) -> List[dict]:
30
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
32
31
  pass
33
32
 
34
33
  @abstractmethod
35
- def embed_query(self, query: str) -> List[float]:
34
+ def embed_query(self, query: str) -> list[float]:
36
35
  pass
36
+
37
def _embed_documents(self, elements: list[str]) -> list[list[float]]:
    """Embed a list of texts by calling ``embed_query`` once per text.

    Args:
        elements: The texts to embed.

    Returns:
        The ``embed_query`` result for each text, in input order.
    """
    return [self.embed_query(query=text) for text in elements]