unstructured-ingest 1.0.2__tar.gz → 1.2.34__tar.gz

This diff shows the contents of publicly available package versions as they were released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (255)
  1. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/.gitignore +1 -0
  2. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/PKG-INFO +18 -9
  3. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/pyproject.toml +15 -11
  4. unstructured_ingest-1.2.34/unstructured_ingest/__version__.py +1 -0
  5. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/cli/README.md +1 -1
  6. unstructured_ingest-1.2.34/unstructured_ingest/data_types/entities.py +17 -0
  7. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/embed/azure_openai.py +11 -4
  8. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/embed/bedrock.py +148 -35
  9. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/embed/huggingface.py +11 -4
  10. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/embed/interfaces.py +11 -8
  11. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/embed/mixedbreadai.py +30 -44
  12. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/embed/octoai.py +27 -6
  13. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/embed/openai.py +51 -8
  14. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/embed/togetherai.py +38 -6
  15. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/embed/vertexai.py +4 -4
  16. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/embed/voyageai.py +10 -7
  17. unstructured_ingest-1.2.34/unstructured_ingest/error.py +156 -0
  18. unstructured_ingest-1.2.34/unstructured_ingest/errors_v2.py +156 -0
  19. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/interfaces/connector.py +7 -1
  20. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/interfaces/downloader.py +2 -0
  21. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/otel.py +18 -1
  22. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/airtable.py +2 -0
  23. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +1 -2
  24. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/astradb.py +100 -8
  25. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  26. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/chroma.py +2 -2
  27. unstructured_ingest-1.2.34/unstructured_ingest/processes/connectors/confluence.py +527 -0
  28. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/databricks/volumes.py +25 -11
  29. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +3 -2
  30. unstructured_ingest-1.2.34/unstructured_ingest/processes/connectors/delta_table.py +310 -0
  31. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/discord.py +4 -3
  32. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +1 -1
  33. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +24 -24
  34. unstructured_ingest-1.2.34/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  35. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/fsspec/azure.py +1 -1
  36. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/fsspec/box.py +1 -1
  37. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +3 -2
  38. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +133 -24
  39. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/fsspec/gcs.py +2 -2
  40. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/fsspec/s3.py +62 -11
  41. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/github.py +8 -3
  42. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/gitlab.py +8 -7
  43. unstructured_ingest-1.2.34/unstructured_ingest/processes/connectors/google_drive.py +848 -0
  44. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +77 -13
  45. unstructured_ingest-1.2.34/unstructured_ingest/processes/connectors/jira.py +522 -0
  46. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/kafka/kafka.py +5 -5
  47. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/local.py +13 -12
  48. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/milvus.py +94 -8
  49. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/mongodb.py +29 -3
  50. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/neo4j.py +59 -24
  51. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/client.py +14 -14
  52. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/connector.py +3 -1
  53. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +40 -3
  54. unstructured_ingest-1.2.34/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  55. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +22 -3
  56. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +1 -0
  57. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +1 -0
  58. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +1 -0
  59. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +1 -0
  60. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +1 -0
  61. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +1 -0
  62. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +1 -0
  63. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +1 -1
  64. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +1 -0
  65. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +1 -0
  66. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +1 -0
  67. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +1 -0
  68. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +1 -0
  69. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +1 -0
  70. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +1 -0
  71. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +1 -0
  72. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +1 -0
  73. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +1 -0
  74. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +1 -0
  75. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +1 -0
  76. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +1 -0
  77. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +1 -0
  78. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/page.py +9 -2
  79. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/user.py +10 -6
  80. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/onedrive.py +25 -7
  81. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/outlook.py +4 -3
  82. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/pinecone.py +35 -7
  83. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/redisdb.py +48 -21
  84. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/salesforce.py +12 -10
  85. unstructured_ingest-1.2.34/unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  86. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/slack.py +6 -4
  87. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/sql/__init__.py +4 -0
  88. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +1 -0
  89. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/sql/snowflake.py +48 -19
  90. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/sql/sql.py +21 -4
  91. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/sql/sqlite.py +1 -0
  92. unstructured_ingest-1.2.34/unstructured_ingest/processes/connectors/sql/teradata.py +253 -0
  93. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/vectara.py +2 -2
  94. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/weaviate/cloud.py +1 -0
  95. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +35 -15
  96. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/zendesk/client.py +8 -2
  97. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +14 -7
  98. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/embedder.py +4 -0
  99. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/partitioner.py +1 -1
  100. unstructured_ingest-1.2.34/unstructured_ingest/processes/utils/__init__.py +8 -0
  101. unstructured_ingest-1.2.34/unstructured_ingest/processes/utils/logging/connector.py +365 -0
  102. unstructured_ingest-1.2.34/unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  103. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/unstructured_api.py +1 -1
  104. unstructured_ingest-1.2.34/unstructured_ingest/utils/__init__.py +5 -0
  105. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/utils/compression.py +2 -1
  106. unstructured_ingest-1.2.34/unstructured_ingest/utils/filesystem.py +27 -0
  107. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/utils/html.py +15 -1
  108. unstructured_ingest-1.2.34/unstructured_ingest/utils/tls.py +15 -0
  109. unstructured_ingest-1.0.2/unstructured_ingest/__version__.py +0 -1
  110. unstructured_ingest-1.0.2/unstructured_ingest/error.py +0 -49
  111. unstructured_ingest-1.0.2/unstructured_ingest/errors_v2.py +0 -25
  112. unstructured_ingest-1.0.2/unstructured_ingest/processes/connectors/confluence.py +0 -308
  113. unstructured_ingest-1.0.2/unstructured_ingest/processes/connectors/delta_table.py +0 -196
  114. unstructured_ingest-1.0.2/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -201
  115. unstructured_ingest-1.0.2/unstructured_ingest/processes/connectors/google_drive.py +0 -488
  116. unstructured_ingest-1.0.2/unstructured_ingest/processes/connectors/jira.py +0 -455
  117. unstructured_ingest-1.0.2/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +0 -57
  118. unstructured_ingest-1.0.2/unstructured_ingest/processes/connectors/sharepoint.py +0 -134
  119. unstructured_ingest-1.0.2/unstructured_ingest/processes/utils/__init__.py +0 -0
  120. unstructured_ingest-1.0.2/unstructured_ingest/utils/__init__.py +0 -0
  121. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/LICENSE.md +0 -0
  122. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/README.md +0 -0
  123. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/__init__.py +0 -0
  124. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/cli/__init__.py +0 -0
  125. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/cli/base/__init__.py +0 -0
  126. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/cli/base/cmd.py +0 -0
  127. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/cli/base/dest.py +0 -0
  128. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/cli/base/importer.py +0 -0
  129. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/cli/base/src.py +0 -0
  130. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/cli/cli.py +0 -0
  131. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/cli/cmds.py +0 -0
  132. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/cli/utils/__init__.py +0 -0
  133. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/cli/utils/click.py +0 -0
  134. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
  135. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/data_types/__init__.py +0 -0
  136. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/data_types/file_data.py +0 -0
  137. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/embed/__init__.py +0 -0
  138. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/interfaces/__init__.py +0 -0
  139. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/interfaces/indexer.py +0 -0
  140. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/interfaces/process.py +0 -0
  141. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/interfaces/processor.py +0 -0
  142. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/interfaces/upload_stager.py +0 -0
  143. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/interfaces/uploader.py +0 -0
  144. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/logger.py +0 -0
  145. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/main.py +0 -0
  146. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/__init__.py +0 -0
  147. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/interfaces.py +0 -0
  148. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/otel.py +0 -0
  149. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/pipeline.py +0 -0
  150. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
  151. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
  152. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/steps/download.py +0 -0
  153. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/steps/embed.py +0 -0
  154. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/steps/filter.py +0 -0
  155. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/steps/index.py +0 -0
  156. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/steps/partition.py +0 -0
  157. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/steps/stage.py +0 -0
  158. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
  159. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/pipeline/steps/upload.py +0 -0
  160. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/__init__.py +0 -0
  161. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/chunker.py +0 -0
  162. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connector_registry.py +0 -0
  163. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/__init__.py +0 -0
  164. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  165. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
  166. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
  167. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
  168. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
  169. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
  170. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
  171. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
  172. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
  173. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
  174. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
  175. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
  176. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
  177. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
  178. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
  179. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
  180. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
  181. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
  182. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
  183. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
  184. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
  185. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
  186. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
  187. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
  188. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
  189. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
  190. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
  191. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  192. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
  193. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
  194. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
  195. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
  196. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
  197. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
  198. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  199. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
  200. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
  201. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
  202. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
  203. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
  204. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
  205. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
  206. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
  207. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
  208. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
  209. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
  210. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
  211. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
  212. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
  213. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
  214. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
  215. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
  216. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
  217. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
  218. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
  219. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
  220. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
  221. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
  222. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
  223. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
  224. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
  225. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
  226. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
  227. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
  228. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
  229. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
  230. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
  231. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
  232. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
  233. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
  234. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
  235. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
  236. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
  237. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
  238. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
  239. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
  240. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/utils.py +0 -0
  241. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
  242. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
  243. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
  244. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  245. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/filter.py +0 -0
  246. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/uncompress.py +0 -0
  247. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
  248. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/utils/chunking.py +0 -0
  249. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/utils/constants.py +0 -0
  250. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/utils/data_prep.py +0 -0
  251. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/utils/dep_check.py +0 -0
  252. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/utils/ndjson.py +0 -0
  253. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/utils/pydantic_models.py +0 -0
  254. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
  255. {unstructured_ingest-1.0.2 → unstructured_ingest-1.2.34}/unstructured_ingest/utils/table.py +0 -0
@@ -108,6 +108,7 @@ celerybeat.pid
108
108
 
109
109
  # Environments
110
110
  .env
111
+ .envrc
111
112
  .venv
112
113
  env/
113
114
  venv/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.2
3
+ Version: 1.2.34
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -12,14 +12,13 @@ Classifier: Intended Audience :: Science/Research
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Classifier: Operating System :: OS Independent
14
14
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
17
  Classifier: Programming Language :: Python :: 3.12
19
18
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
- Requires-Python: <3.13,>=3.9
19
+ Requires-Python: <3.13,>=3.10
20
+ Requires-Dist: certifi>=2025.7.14
21
21
  Requires-Dist: click
22
- Requires-Dist: dataclasses-json
23
22
  Requires-Dist: opentelemetry-sdk
24
23
  Requires-Dist: pydantic>=2.7
25
24
  Requires-Dist: python-dateutil
@@ -36,6 +35,7 @@ Provides-Extra: azure-ai-search
36
35
  Requires-Dist: azure-search-documents; extra == 'azure-ai-search'
37
36
  Provides-Extra: bedrock
38
37
  Requires-Dist: aioboto3; extra == 'bedrock'
38
+ Requires-Dist: aiobotocore[boto3]!=2.24.2; extra == 'bedrock'
39
39
  Requires-Dist: boto3; extra == 'bedrock'
40
40
  Provides-Extra: biomed
41
41
  Requires-Dist: bs4; extra == 'biomed'
@@ -56,11 +56,13 @@ Provides-Extra: databricks-delta-tables
56
56
  Requires-Dist: databricks-sql-connector; extra == 'databricks-delta-tables'
57
57
  Requires-Dist: pandas; extra == 'databricks-delta-tables'
58
58
  Provides-Extra: databricks-volumes
59
- Requires-Dist: databricks-sdk; extra == 'databricks-volumes'
59
+ Requires-Dist: databricks-sdk>=0.70.0; extra == 'databricks-volumes'
60
60
  Provides-Extra: delta-table
61
61
  Requires-Dist: boto3; extra == 'delta-table'
62
62
  Requires-Dist: deltalake; extra == 'delta-table'
63
63
  Requires-Dist: pandas; extra == 'delta-table'
64
+ Requires-Dist: pyarrow; extra == 'delta-table'
65
+ Requires-Dist: tenacity; extra == 'delta-table'
64
66
  Provides-Extra: discord
65
67
  Requires-Dist: discord-py; extra == 'discord'
66
68
  Provides-Extra: doc
@@ -74,7 +76,7 @@ Provides-Extra: duckdb
74
76
  Requires-Dist: duckdb; extra == 'duckdb'
75
77
  Requires-Dist: pandas; extra == 'duckdb'
76
78
  Provides-Extra: elasticsearch
77
- Requires-Dist: elasticsearch[async]; extra == 'elasticsearch'
79
+ Requires-Dist: elasticsearch[async]<9.0.0; extra == 'elasticsearch'
78
80
  Provides-Extra: epub
79
81
  Requires-Dist: unstructured[epub]; extra == 'epub'
80
82
  Provides-Extra: gcs
@@ -88,6 +90,7 @@ Provides-Extra: gitlab
88
90
  Requires-Dist: python-gitlab; extra == 'gitlab'
89
91
  Provides-Extra: google-drive
90
92
  Requires-Dist: google-api-python-client; extra == 'google-drive'
93
+ Requires-Dist: tenacity; extra == 'google-drive'
91
94
  Provides-Extra: hubspot
92
95
  Requires-Dist: hubspot-api-client; extra == 'hubspot'
93
96
  Requires-Dist: urllib3; extra == 'hubspot'
@@ -115,7 +118,7 @@ Requires-Dist: unstructured[md]; extra == 'md'
115
118
  Provides-Extra: milvus
116
119
  Requires-Dist: pymilvus; extra == 'milvus'
117
120
  Provides-Extra: mixedbreadai
118
- Requires-Dist: mixedbread-ai; extra == 'mixedbreadai'
121
+ Requires-Dist: mixedbread; extra == 'mixedbreadai'
119
122
  Provides-Extra: mongodb
120
123
  Requires-Dist: pymongo; extra == 'mongodb'
121
124
  Provides-Extra: msg
@@ -142,7 +145,9 @@ Provides-Extra: openai
142
145
  Requires-Dist: openai; extra == 'openai'
143
146
  Requires-Dist: tiktoken; extra == 'openai'
144
147
  Provides-Extra: opensearch
145
- Requires-Dist: opensearch-py; extra == 'opensearch'
148
+ Requires-Dist: boto3>=1.26.0; extra == 'opensearch'
149
+ Requires-Dist: botocore>=1.29.0; extra == 'opensearch'
150
+ Requires-Dist: opensearch-py<3.0.0,>=2.4.0; extra == 'opensearch'
146
151
  Provides-Extra: org
147
152
  Requires-Dist: unstructured[org]; extra == 'org'
148
153
  Provides-Extra: outlook
@@ -164,7 +169,7 @@ Requires-Dist: qdrant-client; extra == 'qdrant'
164
169
  Provides-Extra: reddit
165
170
  Requires-Dist: praw; extra == 'reddit'
166
171
  Provides-Extra: redis
167
- Requires-Dist: redis; extra == 'redis'
172
+ Requires-Dist: redis<=5.3.0; extra == 'redis'
168
173
  Provides-Extra: remote
169
174
  Requires-Dist: unstructured-client>=0.30.0; extra == 'remote'
170
175
  Provides-Extra: rst
@@ -192,6 +197,9 @@ Provides-Extra: snowflake
192
197
  Requires-Dist: pandas; extra == 'snowflake'
193
198
  Requires-Dist: psycopg2-binary; extra == 'snowflake'
194
199
  Requires-Dist: snowflake-connector-python; extra == 'snowflake'
200
+ Provides-Extra: teradata
201
+ Requires-Dist: pandas; extra == 'teradata'
202
+ Requires-Dist: teradatasql; extra == 'teradata'
195
203
  Provides-Extra: togetherai
196
204
  Requires-Dist: together; extra == 'togetherai'
197
205
  Provides-Extra: tsv
@@ -208,6 +216,7 @@ Requires-Dist: requests; extra == 'vectara'
208
216
  Provides-Extra: vertexai
209
217
  Requires-Dist: vertexai; extra == 'vertexai'
210
218
  Provides-Extra: voyageai
219
+ Requires-Dist: langchain-core<1.0.0,>=0.3.81; extra == 'voyageai'
211
220
  Requires-Dist: voyageai; extra == 'voyageai'
212
221
  Provides-Extra: weaviate
213
222
  Requires-Dist: weaviate-client; extra == 'weaviate'
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "unstructured_ingest"
3
3
  description = "Local ETL data pipeline to get data RAG ready"
4
- requires-python = ">=3.9, <3.13"
4
+ requires-python = ">=3.10, <3.13"
5
5
  authors = [{name = "Unstructured Technologies", email = "devops@unstructuredai.io"}]
6
6
  classifiers = [
7
7
  "Development Status :: 4 - Beta",
@@ -11,7 +11,6 @@ classifiers = [
11
11
  "License :: OSI Approved :: Apache Software License",
12
12
  "Operating System :: OS Independent",
13
13
  "Programming Language :: Python :: 3",
14
- "Programming Language :: Python :: 3.9",
15
14
  "Programming Language :: Python :: 3.10",
16
15
  "Programming Language :: Python :: 3.11",
17
16
  "Programming Language :: Python :: 3.12",
@@ -72,6 +71,7 @@ sharepoint = ["requirements/connectors/sharepoint.txt"]
72
71
  singlestore = ["requirements/connectors/singlestore.txt"]
73
72
  slack = ["requirements/connectors/slack.txt"]
74
73
  snowflake = ["requirements/connectors/snowflake.txt"]
74
+ teradata = ["requirements/connectors/teradata.txt"]
75
75
  vastdb = ["requirements/connectors/vastdb.txt"]
76
76
  vectara = ["requirements/connectors/vectara.txt"]
77
77
  weaviate = ["requirements/connectors/weaviate.txt"]
@@ -136,31 +136,25 @@ test = [
136
136
  "deepdiff",
137
137
  "bs4",
138
138
  "pandas",
139
-
140
139
  # Connector specific deps
141
140
  "cryptography",
142
141
  "fsspec",
143
142
  "vertexai",
144
143
  "pyiceberg",
145
144
  "pyarrow",
145
+ "networkx",
146
+ "htmlbuilder",
147
+ "office365-rest-python-client",
146
148
  ]
147
149
  # Add constraints needed for CI
148
150
  ci = [
149
- # consistency with local-inference-pin
150
- "protobuf<4.24",
151
151
  "grpcio>=1.65.5",
152
152
  # TODO: Pinned in transformers package, remove when that gets updated
153
153
  "tokenizers>=0.19,<0.20",
154
- # TODO: Constaint due to boto, with python before 3.10 not requiring openssl 1.1.1, remove when that gets
155
- # updated or we drop support for 3.9
156
- "urllib3<1.27",
157
154
  # TODO: Constraint due to aiobotocore, remove when that gets updated:
158
155
  "botocore<1.34.132",
159
156
  # TODO: Constraint due to both 8.5.0 and 8.4.0 being installed during pip-compile
160
157
  "importlib-metadata>=8.5.0",
161
- # TODO: Constraint due to boto, with python before 3.10 not requiring openssl 1.1.1, remove when that gets
162
- # updated or we drop support for 3.9
163
- "urllib3<1.27",
164
158
  "unstructured-client>= 0.25.8",
165
159
  "fsspec==2024.5.0",
166
160
  # python 3.12 support
@@ -172,6 +166,8 @@ ci = [
172
166
  "lancedb<=0.15.0",
173
167
  # TODO: versions higher than this are missing the macos wheel
174
168
  "pykx==2.5.3",
169
+ # TODO: Constraint due to perf-analyzer platform compatibility issues
170
+ "tritonclient<=2.60.0", # Allow 2.60.0 (was working), prevent 2.61.0 (has perf-analyzer issues)
175
171
  ]
176
172
 
177
173
  [project.scripts]
@@ -209,3 +205,11 @@ fail_under = 0
209
205
 
210
206
  [tool.hatch.build.targets.sdist]
211
207
  packages = ["/unstructured_ingest"]
208
+
209
+ [tool.codeflash]
210
+ # All paths are relative to this pyproject.toml's directory.
211
+ module-root = "unstructured_ingest"
212
+ tests-root = "test"
213
+ test-framework = "pytest"
214
+ ignore-paths = []
215
+ formatter-cmds = ["ruff check --exit-zero --fix $file", "ruff format $file"]
@@ -0,0 +1 @@
1
+ __version__ = "1.2.34" # pragma: no cover
@@ -7,7 +7,7 @@ source and destination connectors.
7
7
 
8
8
  To manually run the cli:
9
9
  ```shell
10
- PYTHONPATH=. python unstructured_ingest/v2/main.py --help
10
+ PYTHONPATH=. python unstructured_ingest/main.py --help
11
11
  ```
12
12
 
13
13
  The `main.py` file simply wraps the generated Click command created in `cli.py`.
@@ -0,0 +1,17 @@
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
class Entity(BaseModel):
    # One entity record made of two required strings: `type` and `entity`.
    type: str
    entity: str


class EntityRelationship(BaseModel):
    # One relationship record with required `to`, `from_` and `relationship` strings.
    # NOTE(review): presumably a directed edge from `from_` to `to` -- confirm with producers.
    to: str
    # `from` is a reserved word in Python, so the attribute is `from_` and the
    # field is populated through the "from" alias. No default: the field is required.
    from_: str = Field(alias="from")
    relationship: str


class EntitiesData(BaseModel):
    # Container for entity and relationship records; each list defaults to a
    # fresh empty list per instance.
    items: list[Entity] = Field(default_factory=list)
    relationships: list[EntityRelationship] = Field(default_factory=list)
@@ -9,6 +9,7 @@ from unstructured_ingest.embed.openai import (
9
9
  OpenAIEmbeddingEncoder,
10
10
  )
11
11
  from unstructured_ingest.utils.dep_check import requires_dependencies
12
+ from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override
12
13
 
13
14
  if TYPE_CHECKING:
14
15
  from openai import AsyncAzureOpenAI, AzureOpenAI
@@ -16,14 +17,18 @@ if TYPE_CHECKING:
16
17
 
17
18
  class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
18
19
  api_version: str = Field(description="Azure API version", default="2024-06-01")
19
- azure_endpoint: str
20
- embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
20
+ azure_endpoint: str = Field(description="Azure endpoint")
21
+ embedder_model_name: str = Field(
22
+ default="text-embedding-ada-002", alias="model_name", description="Azure OpenAI model name"
23
+ )
21
24
 
22
25
  @requires_dependencies(["openai"], extras="openai")
23
26
  def get_client(self) -> "AzureOpenAI":
24
- from openai import AzureOpenAI
27
+ from openai import AzureOpenAI, DefaultHttpxClient
25
28
 
29
+ client = DefaultHttpxClient(verify=ssl_context_with_optional_ca_override())
26
30
  return AzureOpenAI(
31
+ http_client=client,
27
32
  api_key=self.api_key.get_secret_value(),
28
33
  api_version=self.api_version,
29
34
  azure_endpoint=self.azure_endpoint,
@@ -31,9 +36,11 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
31
36
 
32
37
  @requires_dependencies(["openai"], extras="openai")
33
38
  def get_async_client(self) -> "AsyncAzureOpenAI":
34
- from openai import AsyncAzureOpenAI
39
+ from openai import AsyncAzureOpenAI, DefaultAsyncHttpxClient
35
40
 
41
+ client = DefaultAsyncHttpxClient(verify=ssl_context_with_optional_ca_override())
36
42
  return AsyncAzureOpenAI(
43
+ http_client=client,
37
44
  api_key=self.api_key.get_secret_value(),
38
45
  api_version=self.api_version,
39
46
  azure_endpoint=self.azure_endpoint,
@@ -13,7 +13,7 @@ from unstructured_ingest.embed.interfaces import (
13
13
  BaseEmbeddingEncoder,
14
14
  EmbeddingConfig,
15
15
  )
16
- from unstructured_ingest.errors_v2 import (
16
+ from unstructured_ingest.error import (
17
17
  ProviderError,
18
18
  RateLimitError,
19
19
  UserAuthError,
@@ -26,16 +26,32 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
26
26
  if TYPE_CHECKING:
27
27
  from botocore.client import BaseClient
28
28
 
29
- class BedrockClient(BaseClient):
30
- def invoke_model(self, body: str, modelId: str, accept: str, contentType: str) -> dict:
29
+ class BedrockRuntimeClient(BaseClient):
30
+ def invoke_model(
31
+ self,
32
+ body: str,
33
+ modelId: str,
34
+ accept: str,
35
+ contentType: str,
36
+ inferenceProfileId: str = None,
37
+ ) -> dict:
31
38
  pass
32
39
 
33
- class AsyncBedrockClient(BaseClient):
40
+ class AsyncBedrockRuntimeClient(BaseClient):
34
41
  async def invoke_model(
35
- self, body: str, modelId: str, accept: str, contentType: str
42
+ self,
43
+ body: str,
44
+ modelId: str,
45
+ accept: str,
46
+ contentType: str,
47
+ inferenceProfileId: str = None,
36
48
  ) -> dict:
37
49
  pass
38
50
 
51
+ class BedrockClient(BaseClient):
52
+ def list_foundation_models(self, byOutputModality: str) -> dict:
53
+ pass
54
+
39
55
 
40
56
  def conform_query(query: str, provider: str) -> dict:
41
57
  # replace newlines, which can negatively affect performance.
@@ -54,10 +70,31 @@ def conform_query(query: str, provider: str) -> dict:
54
70
 
55
71
 
56
72
  class BedrockEmbeddingConfig(EmbeddingConfig):
57
- aws_access_key_id: SecretStr
58
- aws_secret_access_key: SecretStr
59
- region_name: str = "us-west-2"
60
- embedder_model_name: str = Field(default="amazon.titan-embed-text-v1", alias="model_name")
73
+ aws_access_key_id: SecretStr | None = Field(description="aws access key id", default=None)
74
+ aws_secret_access_key: SecretStr | None = Field(
75
+ description="aws secret access key", default=None
76
+ )
77
+ region_name: str = Field(
78
+ description="aws region name",
79
+ default_factory=lambda: (
80
+ os.getenv("BEDROCK_REGION_NAME") or
81
+ os.getenv("AWS_DEFAULT_REGION") or
82
+ "us-west-2"
83
+ )
84
+ )
85
+ endpoint_url: str | None = Field(description="custom bedrock endpoint url", default=None)
86
+ access_method: str = Field(
87
+ description="authentication method", default="credentials"
88
+ ) # "credentials" or "iam"
89
+ embedder_model_name: str = Field(
90
+ default="amazon.titan-embed-text-v1",
91
+ alias="model_name",
92
+ description="AWS Bedrock model name",
93
+ )
94
+ inference_profile_id: str | None = Field(
95
+ description="AWS Bedrock inference profile ID",
96
+ default_factory=lambda: os.getenv("BEDROCK_INFERENCE_PROFILE_ID"),
97
+ )
61
98
 
62
99
  def wrap_error(self, e: Exception) -> Exception:
63
100
  if is_internal_error(e=e):
@@ -87,19 +124,82 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
87
124
  logger.error(f"unhandled exception from bedrock: {e}", exc_info=True)
88
125
  return e
89
126
 
127
def run_precheck(self) -> None:
    """Validate the auth configuration and confirm the embedding model exists.

    First checks ``access_method``: ``"credentials"`` requires both
    ``aws_access_key_id`` and ``aws_secret_access_key``; ``"iam"`` requires
    nothing here. Then lists the Bedrock foundation models whose output
    modality is ``EMBEDDING`` and verifies that ``embedder_model_name`` equals
    either a listed model ID or a listed model ARN.

    Raises:
        ValueError: if ``access_method`` is neither ``"credentials"`` nor
            ``"iam"``, or is ``"credentials"`` without both keys set.
        Exception: whatever ``wrap_error`` returns for any failure while
            listing or matching models, including the model-not-found
            ``UserError`` raised below.
    """
    # Validate access method and credentials configuration
    if self.access_method == "credentials":
        if not (self.aws_access_key_id and self.aws_secret_access_key):
            raise ValueError(
                "Credentials access method requires aws_access_key_id and aws_secret_access_key"
            )
    elif self.access_method != "iam":
        # "iam" needs no explicit keys: credentials are handled by the AWS SDK.
        raise ValueError(
            f"Invalid access_method: {self.access_method}. Must be 'credentials' or 'iam'"
        )

    client = self.get_bedrock_client()
    try:
        model_info = client.list_foundation_models(byOutputModality="EMBEDDING")
        summaries = model_info.get("modelSummaries", [])
        model_ids = [m["modelId"] for m in summaries]
        # Use each ARN as-is. Calling ":".join() on the ARN string would put a
        # colon between every character, so a model configured by ARN could
        # never match and the error message would list garbled ARNs.
        arns = [m["modelArn"] for m in summaries]

        if self.embedder_model_name not in model_ids and self.embedder_model_name not in arns:
            raise UserError(
                "model '{}' not found either : {} or {}".format(
                    self.embedder_model_name, ", ".join(model_ids), ", ".join(arns)
                )
            )
    except Exception as e:
        raise self.wrap_error(e=e)
157
+
158
def get_client_kwargs(self) -> dict:
    """Build the keyword arguments used to construct a Bedrock client.

    Always includes ``region_name``; adds ``endpoint_url`` when one is set.
    With ``access_method == "credentials"`` the unwrapped access key pair is
    added; with ``"iam"`` no keys are passed, so boto3 falls back to its
    default credential chain (IAM roles, environment, etc.).

    Raises:
        ValueError: if ``access_method`` is neither ``"credentials"`` nor
            ``"iam"``, or is ``"credentials"`` without both keys set.
    """
    method = self.access_method
    if method not in ("credentials", "iam"):
        raise ValueError(
            f"Invalid access_method: {self.access_method}. Must be 'credentials' or 'iam'"
        )

    client_kwargs = {"region_name": self.region_name}
    if self.endpoint_url:
        client_kwargs["endpoint_url"] = self.endpoint_url

    if method == "iam":
        # Nothing to add: boto3 resolves credentials by itself.
        return client_kwargs

    access_key = self.aws_access_key_id
    secret_key = self.aws_secret_access_key
    if not (access_key and secret_key):
        raise ValueError(
            "Credentials access method requires aws_access_key_id and aws_secret_access_key"
        )
    client_kwargs["aws_access_key_id"] = access_key.get_secret_value()
    client_kwargs["aws_secret_access_key"] = secret_key.get_secret_value()
    return client_kwargs
183
+
184
@requires_dependencies(["boto3"], extras="bedrock")
def get_bedrock_client(self) -> "BedrockClient":
    """Create a boto3 client for the ``bedrock`` service.

    This is the client ``run_precheck`` uses to list foundation models; it is
    separate from the ``bedrock-runtime`` client used to invoke them. Connection
    settings come from ``get_client_kwargs()``.
    """
    import boto3

    client_kwargs = self.get_client_kwargs()
    return boto3.client(service_name="bedrock", **client_kwargs)
194
+
90
195
  @requires_dependencies(
91
196
  ["boto3", "numpy", "botocore"],
92
197
  extras="bedrock",
93
198
  )
94
- def get_client(self) -> "BedrockClient":
199
+ def get_client(self) -> "BedrockRuntimeClient":
95
200
  import boto3
96
201
 
97
- bedrock_client = boto3.client(
98
- service_name="bedrock-runtime",
99
- aws_access_key_id=self.aws_access_key_id.get_secret_value(),
100
- aws_secret_access_key=self.aws_secret_access_key.get_secret_value(),
101
- region_name=self.region_name,
102
- )
202
+ bedrock_client = boto3.client(service_name="bedrock-runtime", **self.get_client_kwargs())
103
203
 
104
204
  return bedrock_client
105
205
 
@@ -108,16 +208,11 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
108
208
  extras="bedrock",
109
209
  )
110
210
  @asynccontextmanager
111
- async def get_async_client(self) -> AsyncIterable["AsyncBedrockClient"]:
211
+ async def get_async_client(self) -> AsyncIterable["AsyncBedrockRuntimeClient"]:
112
212
  import aioboto3
113
213
 
114
214
  session = aioboto3.Session()
115
- async with session.client(
116
- "bedrock-runtime",
117
- aws_access_key_id=self.aws_access_key_id.get_secret_value(),
118
- aws_secret_access_key=self.aws_secret_access_key.get_secret_value(),
119
- region_name=self.region_name,
120
- ) as aws_bedrock:
215
+ async with session.client("bedrock-runtime", **self.get_client_kwargs()) as aws_bedrock:
121
216
  yield aws_bedrock
122
217
 
123
218
 
@@ -125,6 +220,9 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
125
220
  class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
126
221
  config: BedrockEmbeddingConfig
127
222
 
223
def precheck(self):
    """Run the configuration-level precheck by calling ``self.config.run_precheck()``."""
    config = self.config
    config.run_precheck()
225
+
128
226
  def wrap_error(self, e: Exception) -> Exception:
129
227
  return self.config.wrap_error(e=e)
130
228
 
@@ -136,12 +234,18 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
136
234
  bedrock_client = self.config.get_client()
137
235
  # invoke bedrock API
138
236
  try:
139
- response = bedrock_client.invoke_model(
140
- body=json.dumps(body),
141
- modelId=self.config.embedder_model_name,
142
- accept="application/json",
143
- contentType="application/json",
144
- )
237
+ invoke_params = {
238
+ "body": json.dumps(body),
239
+ "modelId": self.config.embedder_model_name,
240
+ "accept": "application/json",
241
+ "contentType": "application/json",
242
+ }
243
+
244
+ # Add inference profile if configured
245
+ if self.config.inference_profile_id:
246
+ invoke_params["inferenceProfileId"] = self.config.inference_profile_id
247
+
248
+ response = bedrock_client.invoke_model(**invoke_params)
145
249
  except Exception as e:
146
250
  raise self.wrap_error(e=e)
147
251
 
@@ -168,6 +272,9 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
168
272
  class AsyncBedrockEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
169
273
  config: BedrockEmbeddingConfig
170
274
 
275
def precheck(self):
    """Run the configuration-level precheck by calling ``self.config.run_precheck()``.

    The call is synchronous even though this encoder is the async variant.
    """
    config = self.config
    config.run_precheck()
277
+
171
278
  def wrap_error(self, e: Exception) -> Exception:
172
279
  return self.config.wrap_error(e=e)
173
280
 
@@ -179,12 +286,18 @@ class AsyncBedrockEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
179
286
  async with self.config.get_async_client() as bedrock_client:
180
287
  # invoke bedrock API
181
288
  try:
182
- response = await bedrock_client.invoke_model(
183
- body=json.dumps(body),
184
- modelId=self.config.embedder_model_name,
185
- accept="application/json",
186
- contentType="application/json",
187
- )
289
+ invoke_params = {
290
+ "body": json.dumps(body),
291
+ "modelId": self.config.embedder_model_name,
292
+ "accept": "application/json",
293
+ "contentType": "application/json",
294
+ }
295
+
296
+ # Add inference profile if configured
297
+ if self.config.inference_profile_id:
298
+ invoke_params["inferenceProfileId"] = self.config.inference_profile_id
299
+
300
+ response = await bedrock_client.invoke_model(**invoke_params)
188
301
  except Exception as e:
189
302
  raise self.wrap_error(e=e)
190
303
  async with response.get("body") as client_response:
@@ -15,15 +15,22 @@ if TYPE_CHECKING:
15
15
 
16
16
 
17
17
  class HuggingFaceEmbeddingConfig(EmbeddingConfig):
18
- embedder_model_name: Optional[str] = Field(default="all-MiniLM-L6-v2", alias="model_name")
18
+ embedder_model_name: Optional[str] = Field(
19
+ default="all-MiniLM-L6-v2", alias="model_name", description="HuggingFace model name"
20
+ )
19
21
  embedder_model_kwargs: Optional[dict] = Field(
20
- default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
22
+ default_factory=lambda: {"device": "cpu"},
23
+ alias="model_kwargs",
24
+ description="additional model parameters",
25
+ )
26
+ encode_kwargs: Optional[dict] = Field(
27
+ default_factory=lambda: {"normalize_embeddings": False},
28
+ description="additional embedding parameters",
21
29
  )
22
- encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
23
30
 
24
31
  @requires_dependencies(
25
32
  ["sentence_transformers"],
26
- extras="embed-huggingface",
33
+ extras="huggingface",
27
34
  )
28
35
  def get_client(self) -> "SentenceTransformer":
29
36
  from sentence_transformers import SentenceTransformer
@@ -20,6 +20,9 @@ class EmbeddingConfig(BaseModel):
20
20
  class BaseEncoder(ABC):
21
21
  config: EmbeddingConfig
22
22
 
23
def precheck(self):
    """No-op precheck: this base implementation performs no validation and returns ``None``."""
    return None
25
+
23
26
  def initialize(self):
24
27
  """Initializes the embedding encoder class. Should also validate the instance
25
28
  is properly configured: e.g., embed a single a element"""
@@ -64,14 +67,14 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
64
67
  elements = elements.copy()
65
68
  elements_with_text = [e for e in elements if e.get("text")]
66
69
  texts = [e["text"] for e in elements_with_text]
67
- embeddings = []
70
+ all_embeddings = []
68
71
  try:
69
72
  for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
70
- embeddings = self.embed_batch(client=client, batch=batch)
71
- embeddings.extend(embeddings)
73
+ embeddings_batch = self.embed_batch(client=client, batch=batch)
74
+ all_embeddings.extend(embeddings_batch)
72
75
  except Exception as e:
73
76
  raise self.wrap_error(e=e)
74
- for element, embedding in zip(elements_with_text, embeddings):
77
+ for element, embedding in zip(elements_with_text, all_embeddings, strict=True):
75
78
  element[EMBEDDINGS_KEY] = embedding
76
79
  return elements
77
80
 
@@ -120,14 +123,14 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
120
123
  elements = elements.copy()
121
124
  elements_with_text = [e for e in elements if e.get("text")]
122
125
  texts = [e["text"] for e in elements_with_text]
123
- embeddings = []
126
+ all_embeddings = []
124
127
  try:
125
128
  for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
126
- embeddings = await self.embed_batch(client=client, batch=batch)
127
- embeddings.extend(embeddings)
129
+ embeddings_batch = await self.embed_batch(client=client, batch=batch)
130
+ all_embeddings.extend(embeddings_batch)
128
131
  except Exception as e:
129
132
  raise self.wrap_error(e=e)
130
- for element, embedding in zip(elements_with_text, embeddings):
133
+ for element, embedding in zip(elements_with_text, all_embeddings, strict=True):
131
134
  element[EMBEDDINGS_KEY] = embedding
132
135
  return elements
133
136