unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class EmbeddingConfig(BaseModel):
    # Shared pydantic base for embedding settings. It declares no fields itself;
    # provider-specific configs subclass it and add their own.
    ...
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class BaseEncoder(ABC):
    """Common base for embedding encoders: holds the config and shared helpers."""

    config: EmbeddingConfig

    def initialize(self):
        """Initialize the embedding encoder.

        Implementations should also validate that the instance is properly
        configured, e.g. by embedding a single element. The base implementation
        does nothing.
        """

    def wrap_error(self, e: Exception) -> Exception:
        """Translate an error coming from the embedding service.

        Subclasses should return a more informative error where possible; the
        base implementation returns ``e`` unchanged.

        Args:
            e (Exception): The error raised by the embedding service.

        Returns:
            Exception: The error to surface to the caller.
        """
        return e

    @staticmethod
    def _add_embeddings_to_elements(
        elements: list[dict], embeddings: list[list[float]]
    ) -> list[dict]:
        """
        Attach one embedding to each element under the "embeddings" key.

        The element dicts are updated in place and the same list is returned.

        Args:
            elements (list[dict]): List of elements.
            embeddings (list[list[float]]): One embedding per element, in the same order.

        Returns:
            list[dict]: The input elements, each with its embedding added.

        Raises:
            AssertionError: If the two lists differ in length.
        """
        assert len(elements) == len(
            embeddings
        ), f"got {len(embeddings)} embeddings for {len(elements)} elements"
        for element, embedding in zip(elements, embeddings):
            element["embeddings"] = embedding
        return elements
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class BaseEmbeddingEncoder(BaseEncoder, ABC):
    """Synchronous embedding encoder interface.

    Subclasses must implement ``embed_documents`` and ``embed_query``.
    """

    def initialize(self):
        """Initialize the embedding encoder.

        Implementations should also validate that the instance is properly
        configured, e.g. by embedding a single element. The base implementation
        does nothing.
        """

    @property
    def num_of_dimensions(self) -> tuple[int, ...]:
        """Shape of the exemplary embedding, as reported by ``np.shape``."""
        return np.shape(self.get_exemplary_embedding())

    def get_exemplary_embedding(self) -> list[float]:
        """Embed the fixed probe query "Q" and return the resulting vector."""
        return self.embed_query(query="Q")

    @property
    def is_unit_vector(self) -> bool:
        """Denotes if the embedding vector is a unit vector."""
        norm = np.linalg.norm(self.get_exemplary_embedding())
        # np.isclose yields a numpy boolean; convert it so the declared
        # ``bool`` return type is actually honoured.
        return bool(np.isclose(norm, 1.0))

    @abstractmethod
    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the given elements and return them with embeddings attached."""

    @abstractmethod
    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return its vector."""

    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Embed each text with its own ``embed_query`` call, preserving input order.

        Args:
            elements (list[str]): Texts to embed.

        Returns:
            list[list[float]]: One embedding per text.
        """
        return [self.embed_query(query=text) for text in elements]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass
class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
    """Asynchronous embedding encoder interface.

    Subclasses must implement the ``embed_documents`` and ``embed_query`` coroutines.
    """

    async def initialize(self):
        """Initialize the embedding encoder.

        Implementations should also validate that the instance is properly
        configured, e.g. by embedding a single element. The base implementation
        does nothing.
        """

    @property
    async def num_of_dimensions(self) -> tuple[int, ...]:
        """Shape of the exemplary embedding, as reported by ``np.shape``.

        This is an async property: accessing it yields a coroutine that must be awaited.
        """
        return np.shape(await self.get_exemplary_embedding())

    async def get_exemplary_embedding(self) -> list[float]:
        """Embed the fixed probe query "Q" and return the resulting vector."""
        return await self.embed_query(query="Q")

    @property
    async def is_unit_vector(self) -> bool:
        """Denotes if the embedding vector is a unit vector.

        This is an async property: accessing it yields a coroutine that must be awaited.
        """
        norm = np.linalg.norm(await self.get_exemplary_embedding())
        # np.isclose yields a numpy boolean; convert it so the declared
        # ``bool`` return type is actually honoured.
        return bool(np.isclose(norm, 1.0))

    @abstractmethod
    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the given elements and return them with embeddings attached."""

    @abstractmethod
    async def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return its vector."""

    async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Embed all texts concurrently, one ``embed_query`` call per text.

        Args:
            elements (list[str]): Texts to embed.

        Returns:
            list[list[float]]: One embedding per text, in input order.
        """
        return await asyncio.gather(*[self.embed_query(query=text) for text in elements])
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import os
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
from pydantic import Field, SecretStr
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.embed.interfaces import (
|
|
9
|
+
AsyncBaseEmbeddingEncoder,
|
|
10
|
+
BaseEmbeddingEncoder,
|
|
11
|
+
EmbeddingConfig,
|
|
12
|
+
)
|
|
13
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
+
|
|
15
|
+
# Defaults applied to every Mixedbread AI embeddings request made in this module:
#   USER_AGENT          - value of the "User-Agent" header added to each request
#   BATCH_SIZE          - maximum number of texts sent in a single embeddings call
#   TIMEOUT             - passed as `timeout_in_seconds` in the request options
#   MAX_RETRIES         - passed as `max_retries` in the request options
#   ENCODING_FORMAT     - passed as `encoding_format` to the embeddings call
#   TRUNCATION_STRATEGY - passed as `truncation_strategy` to the embeddings call
USER_AGENT = "@mixedbread-ai/unstructured"
|
|
16
|
+
BATCH_SIZE = 128
|
|
17
|
+
TIMEOUT = 60
|
|
18
|
+
MAX_RETRIES = 3
|
|
19
|
+
ENCODING_FORMAT = "float"
|
|
20
|
+
TRUNCATION_STRATEGY = "end"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from mixedbread_ai.client import AsyncMixedbreadAI, MixedbreadAI
|
|
25
|
+
from mixedbread_ai.core import RequestOptions
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class MixedbreadAIEmbeddingConfig(EmbeddingConfig):
    """
    Settings for the Mixedbread AI embedding encoders.

    Attributes:
        api_key (SecretStr): API key for accessing Mixedbread AI. When not supplied,
            it is read from the MXBAI_API_KEY environment variable.
        embedder_model_name (str): Name of the model to use for embeddings
            (accepted under the alias "model_name").
    """

    # NOTE(review): if MXBAI_API_KEY is unset the secret wraps None - confirm the
    # client handles that as intended.
    api_key: SecretStr = Field(default_factory=lambda: SecretStr(os.getenv("MXBAI_API_KEY")))

    embedder_model_name: str = Field(
        alias="model_name",
        default="mixedbread-ai/mxbai-embed-large-v1",
    )

    @requires_dependencies(["mixedbread_ai"], extras="embed-mixedbreadai")
    def get_client(self) -> "MixedbreadAI":
        """
        Build a synchronous Mixedbread AI client using the configured API key.

        Returns:
            MixedbreadAI: Initialized client.
        """
        from mixedbread_ai.client import MixedbreadAI

        secret = self.api_key.get_secret_value()
        return MixedbreadAI(api_key=secret)

    @requires_dependencies(["mixedbread_ai"], extras="embed-mixedbreadai")
    def get_async_client(self) -> "AsyncMixedbreadAI":
        """
        Build an asynchronous Mixedbread AI client using the configured API key.

        Returns:
            AsyncMixedbreadAI: Initialized async client.
        """
        from mixedbread_ai.client import AsyncMixedbreadAI

        secret = self.api_key.get_secret_value()
        return AsyncMixedbreadAI(api_key=secret)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """
    Embedding encoder for Mixedbread AI.

    Attributes:
        config (MixedbreadAIEmbeddingConfig): Configuration for the embedding encoder.
    """

    config: MixedbreadAIEmbeddingConfig

    def get_exemplary_embedding(self) -> list[float]:
        """Get an exemplary embedding to determine dimensions and unit vector status."""
        return self._embed(["Q"])[0]

    @requires_dependencies(
        ["mixedbread_ai"],
        extras="embed-mixedbreadai",
    )
    def get_request_options(self) -> "RequestOptions":
        """
        Build the options sent with each request: retries, timeout and User-Agent header.

        Returns:
            RequestOptions: Options built from MAX_RETRIES, TIMEOUT and USER_AGENT.
        """
        from mixedbread_ai.core import RequestOptions

        return RequestOptions(
            max_retries=MAX_RETRIES,
            timeout_in_seconds=TIMEOUT,
            additional_headers={"User-Agent": USER_AGENT},
        )

    def _embed(self, texts: list[str]) -> list[list[float]]:
        """
        Embed a list of texts using the Mixedbread AI API.

        Texts are sent in consecutive batches of at most BATCH_SIZE, one request per
        batch, and the per-batch results are concatenated in batch order.

        Args:
            texts (list[str]): List of texts to embed.

        Returns:
            list[list[float]]: List of embeddings.
        """
        client = self.config.get_client()
        # The options are identical for every batch, so build them once.
        request_options = self.get_request_options()

        embeddings: list[list[float]] = []
        for start in range(0, len(texts), BATCH_SIZE):
            response = client.embeddings(
                model=self.config.embedder_model_name,
                normalized=True,
                encoding_format=ENCODING_FORMAT,
                truncation_strategy=TRUNCATION_STRATEGY,
                request_options=request_options,
                input=texts[start : start + BATCH_SIZE],
            )
            embeddings.extend(item.embedding for item in response.data)
        return embeddings

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """
        Embed a list of document elements.

        The text of each element is read from its "text" key; a missing key is
        embedded as an empty string.

        Args:
            elements (list[dict]): List of document elements.

        Returns:
            list[dict]: Elements with embeddings.
        """
        embeddings = self._embed([e.get("text", "") for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def embed_query(self, query: str) -> list[float]:
        """
        Embed a query string.

        Args:
            query (str): Query string to embed.

        Returns:
            list[float]: Embedding of the query.
        """
        return self._embed([query])[0]
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@dataclass
class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
    """
    Asynchronous embedding encoder for Mixedbread AI.

    Attributes:
        config (MixedbreadAIEmbeddingConfig): Configuration for the embedding encoder.
    """

    config: MixedbreadAIEmbeddingConfig

    async def get_exemplary_embedding(self) -> list[float]:
        """Get an exemplary embedding to determine dimensions and unit vector status."""
        embedding = await self._embed(["Q"])
        return embedding[0]

    @requires_dependencies(
        ["mixedbread_ai"],
        extras="embed-mixedbreadai",
    )
    def get_request_options(self) -> "RequestOptions":
        """
        Build the options sent with each request: retries, timeout and User-Agent header.

        Returns:
            RequestOptions: Options built from MAX_RETRIES, TIMEOUT and USER_AGENT.
        """
        from mixedbread_ai.core import RequestOptions

        return RequestOptions(
            max_retries=MAX_RETRIES,
            timeout_in_seconds=TIMEOUT,
            additional_headers={"User-Agent": USER_AGENT},
        )

    async def _embed(self, texts: list[str]) -> list[list[float]]:
        """
        Embed a list of texts using the Mixedbread AI API.

        Texts are split into consecutive batches of at most BATCH_SIZE; the batch
        requests are awaited together and their results concatenated in batch order.

        Args:
            texts (list[str]): List of texts to embed.

        Returns:
            list[list[float]]: List of embeddings.
        """
        client = self.config.get_async_client()
        # The options are identical for every batch, so build them once.
        request_options = self.get_request_options()

        tasks = [
            client.embeddings(
                model=self.config.embedder_model_name,
                normalized=True,
                encoding_format=ENCODING_FORMAT,
                truncation_strategy=TRUNCATION_STRATEGY,
                request_options=request_options,
                input=texts[start : start + BATCH_SIZE],
            )
            for start in range(0, len(texts), BATCH_SIZE)
        ]
        responses = await asyncio.gather(*tasks)
        return [item.embedding for response in responses for item in response.data]

    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """
        Embed a list of document elements.

        The text of each element is read from its "text" key; a missing key is
        embedded as an empty string.

        Args:
            elements (list[dict]): List of document elements.

        Returns:
            list[dict]: Elements with embeddings.
        """
        embeddings = await self._embed([e.get("text", "") for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    async def embed_query(self, query: str) -> list[float]:
        """
        Embed a query string.

        Args:
            query (str): Query string to embed.

        Returns:
            list[float]: Embedding of the query.
        """
        embedding = await self._embed([query])
        return embedding[0]
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
|
|
4
|
+
from pydantic import Field, SecretStr
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.interfaces import (
|
|
7
|
+
AsyncBaseEmbeddingEncoder,
|
|
8
|
+
BaseEmbeddingEncoder,
|
|
9
|
+
EmbeddingConfig,
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.logger import logger
|
|
12
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
+
from unstructured_ingest.v2.errors import (
|
|
14
|
+
ProviderError,
|
|
15
|
+
QuotaError,
|
|
16
|
+
RateLimitError,
|
|
17
|
+
UserAuthError,
|
|
18
|
+
UserError,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from openai import AsyncOpenAI, OpenAI
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class OctoAiEmbeddingConfig(EmbeddingConfig):
    """Settings for the OctoAI embedding encoders, which call OctoAI through the OpenAI SDK.

    Attributes:
        api_key (SecretStr): API key passed to the OpenAI client.
        embedder_model_name (str): Embedding model name (accepted under the alias "model_name").
        base_url (str): Base URL the OpenAI client is pointed at.
    """

    api_key: SecretStr
    embedder_model_name: str = Field(default="thenlper/gte-large", alias="model_name")
    base_url: str = Field(default="https://text.octoai.run/v1")

    def wrap_error(self, e: Exception) -> Exception:
        """Map an OpenAI SDK error onto this package's error types.

        Args:
            e (Exception): Error raised while calling the embeddings endpoint.

        Returns:
            Exception: UserAuthError for HTTP 401, QuotaError or RateLimitError for
                HTTP 429, UserError for any other 4xx, ProviderError for 5xx, and
                ``e`` itself when it is not an ``APIStatusError`` or has any other status.
        """
        # https://platform.openai.com/docs/guides/error-codes/api-errors
        from openai import APIStatusError

        if not isinstance(e, APIStatusError):
            logger.error(f"unhandled exception from openai: {e}", exc_info=True)
            # Hand the error back instead of raising it here, as the signature
            # declares; the encoders raise whatever this method returns.
            return e
        status = e.status_code
        if status == 401:
            return UserAuthError(e.message)
        if status == 429:
            # 429 indicates either rate limit exceeded or quota exceeded
            if e.code == "insufficient_quota":
                return QuotaError(e.message)
            return RateLimitError(e.message)
        if 400 <= status < 500:
            # any other client-side (user) error
            return UserError(e.message)
        if status >= 500:
            return ProviderError(e.message)
        logger.error(f"unhandled exception from openai: {e}", exc_info=True)
        return e

    @requires_dependencies(
        ["openai", "tiktoken"],
        extras="embed-octoai",
    )
    def get_client(self) -> "OpenAI":
        """Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
        from openai import OpenAI

        return OpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)

    @requires_dependencies(
        ["openai", "tiktoken"],
        extras="embed-octoai",
    )
    def get_async_client(self) -> "AsyncOpenAI":
        """Creates an async OpenAI python client to embed elements. Uses the OpenAI SDK."""
        from openai import AsyncOpenAI

        return AsyncOpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Synchronous OctoAI embedding encoder built on the OpenAI client."""

    config: OctoAiEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string.

        Args:
            query (str): Query string to embed.

        Returns:
            list[float]: The first embedding in the response.
        """
        # Create the client outside the try block so that a failure to build it
        # (e.g. a missing dependency) is raised as-is rather than being routed
        # through wrap_error, which only understands API errors.
        client = self.config.get_client()
        try:
            response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
        except Exception as e:
            raise self.wrap_error(e=e)
        return response.data[0].embedding

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed all elements in one request and attach the embeddings to them.

        The text of each element is read from its "text" key; a missing key is
        embedded as an empty string.

        Args:
            elements (list[dict]): List of document elements.

        Returns:
            list[dict]: Elements with embeddings.
        """
        texts = [e.get("text", "") for e in elements]
        # See embed_query: client creation is deliberately kept out of the try block.
        client = self.config.get_client()
        try:
            response = client.embeddings.create(input=texts, model=self.config.embedder_model_name)
        except Exception as e:
            raise self.wrap_error(e=e)
        embeddings = [data.embedding for data in response.data]
        return self._add_embeddings_to_elements(elements, embeddings)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@dataclass
class AsyncOctoAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
    """Asynchronous OctoAI embedding encoder built on the async OpenAI client."""

    config: OctoAiEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    async def embed_query(self, query: str):
        """Embed one query string and return the first embedding in the response."""
        async_client = self.config.get_async_client()
        try:
            result = await async_client.embeddings.create(
                model=self.config.embedder_model_name,
                input=query,
            )
        except Exception as err:
            raise self.wrap_error(e=err)
        return result.data[0].embedding

    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed every element's "text" (empty string if absent) in a single request."""
        texts = [element.get("text", "") for element in elements]
        async_client = self.config.get_async_client()
        try:
            result = await async_client.embeddings.create(
                model=self.config.embedder_model_name,
                input=texts,
            )
        except Exception as err:
            raise self.wrap_error(e=err)
        vectors = [entry.embedding for entry in result.data]
        return self._add_embeddings_to_elements(elements, vectors)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
|
|
4
|
+
from pydantic import Field, SecretStr
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.interfaces import (
|
|
7
|
+
AsyncBaseEmbeddingEncoder,
|
|
8
|
+
BaseEmbeddingEncoder,
|
|
9
|
+
EmbeddingConfig,
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.logger import logger
|
|
12
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
+
from unstructured_ingest.v2.errors import (
|
|
14
|
+
ProviderError,
|
|
15
|
+
QuotaError,
|
|
16
|
+
RateLimitError,
|
|
17
|
+
UserAuthError,
|
|
18
|
+
UserError,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from openai import AsyncOpenAI, OpenAI
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class OpenAIEmbeddingConfig(EmbeddingConfig):
    """Settings for the OpenAI embedding encoders.

    Attributes:
        api_key (SecretStr): API key passed to the OpenAI client.
        embedder_model_name (str): Embedding model name (accepted under the alias "model_name").
    """

    api_key: SecretStr
    embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")

    def wrap_error(self, e: Exception) -> Exception:
        """Map an OpenAI SDK error onto this package's error types.

        Args:
            e (Exception): Error raised while calling the embeddings endpoint.

        Returns:
            Exception: UserAuthError for HTTP 401, QuotaError or RateLimitError for
                HTTP 429, UserError for any other 4xx, ProviderError for 5xx, and
                ``e`` itself when it is not an ``APIStatusError`` or has any other status.
        """
        # https://platform.openai.com/docs/guides/error-codes/api-errors
        from openai import APIStatusError

        if not isinstance(e, APIStatusError):
            logger.error(f"unhandled exception from openai: {e}", exc_info=True)
            # Hand the error back instead of raising it here, as the signature
            # declares; the encoders raise whatever this method returns.
            return e
        status = e.status_code
        if status == 401:
            return UserAuthError(e.message)
        if status == 429:
            # 429 indicates either rate limit exceeded or quota exceeded
            if e.code == "insufficient_quota":
                return QuotaError(e.message)
            return RateLimitError(e.message)
        if 400 <= status < 500:
            # any other client-side (user) error
            return UserError(e.message)
        if status >= 500:
            return ProviderError(e.message)
        logger.error(f"unhandled exception from openai: {e}", exc_info=True)
        return e

    @requires_dependencies(["openai"], extras="openai")
    def get_client(self) -> "OpenAI":
        """Create a synchronous OpenAI client using the configured API key."""
        from openai import OpenAI

        return OpenAI(api_key=self.api_key.get_secret_value())

    @requires_dependencies(["openai"], extras="openai")
    def get_async_client(self) -> "AsyncOpenAI":
        """Create an asynchronous OpenAI client using the configured API key."""
        from openai import AsyncOpenAI

        return AsyncOpenAI(api_key=self.api_key.get_secret_value())
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Synchronous OpenAI embedding encoder."""

    config: OpenAIEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    def embed_query(self, query: str) -> list[float]:
        """Embed one query string and return the first embedding in the response."""
        openai_client = self.config.get_client()
        try:
            result = openai_client.embeddings.create(
                model=self.config.embedder_model_name,
                input=query,
            )
        except Exception as err:
            raise self.wrap_error(e=err)
        return result.data[0].embedding

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed every element's "text" (empty string if absent) and attach the results."""
        texts = [element.get("text", "") for element in elements]
        vectors = self._embed_documents(texts)
        return self._add_embeddings_to_elements(elements, vectors)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass
class AsyncOpenAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
    """Asynchronous OpenAI embedding encoder."""

    config: OpenAIEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    async def embed_query(self, query: str) -> list[float]:
        """Embed one query string and return the first embedding in the response."""
        async_client = self.config.get_async_client()
        try:
            result = await async_client.embeddings.create(
                model=self.config.embedder_model_name,
                input=query,
            )
        except Exception as err:
            raise self.wrap_error(e=err)
        return result.data[0].embedding

    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed every element's "text" (empty string if absent) in a single request."""
        async_client = self.config.get_async_client()
        texts = [element.get("text", "") for element in elements]
        try:
            result = await async_client.embeddings.create(
                model=self.config.embedder_model_name,
                input=texts,
            )
        except Exception as err:
            raise self.wrap_error(e=err)
        vectors = [entry.embedding for entry in result.data]
        return self._add_embeddings_to_elements(elements, vectors)
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
|
|
4
|
+
from pydantic import Field, SecretStr
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.interfaces import (
|
|
7
|
+
AsyncBaseEmbeddingEncoder,
|
|
8
|
+
BaseEmbeddingEncoder,
|
|
9
|
+
EmbeddingConfig,
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.logger import logger
|
|
12
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
+
from unstructured_ingest.v2.errors import (
|
|
14
|
+
RateLimitError as CustomRateLimitError,
|
|
15
|
+
)
|
|
16
|
+
from unstructured_ingest.v2.errors import (
|
|
17
|
+
UserAuthError,
|
|
18
|
+
UserError,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from together import AsyncTogether, Together
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class TogetherAIEmbeddingConfig(EmbeddingConfig):
    """Settings for the Together AI embedding encoders.

    Attributes:
        api_key (SecretStr): API key passed to the Together client.
        embedder_model_name (str): Embedding model name (accepted under the alias "model_name").
    """

    api_key: SecretStr
    embedder_model_name: str = Field(
        default="togethercomputer/m2-bert-80M-8k-retrieval", alias="model_name"
    )

    def wrap_error(self, e: Exception) -> Exception:
        """Map a Together SDK error onto this package's error types.

        Args:
            e (Exception): Error raised while calling the embeddings endpoint.

        Returns:
            Exception: UserAuthError for an ``AuthenticationError``, RateLimitError for
                the SDK's ``RateLimitError``, UserError for any other
                ``TogetherException``, and ``e`` itself for anything else.
        """
        # https://docs.together.ai/docs/error-codes
        from together.error import AuthenticationError, RateLimitError, TogetherException

        if not isinstance(e, TogetherException):
            logger.error(f"unhandled exception from together: {e}", exc_info=True)
            return e
        # Fall back to the string form if the exception carries no arguments.
        message = e.args[0] if e.args else str(e)
        if isinstance(e, AuthenticationError):
            return UserAuthError(message)
        if isinstance(e, RateLimitError):
            return CustomRateLimitError(message)
        return UserError(message)

    @requires_dependencies(["together"], extras="togetherai")
    def get_client(self) -> "Together":
        """Create a synchronous Together client using the configured API key."""
        from together import Together

        return Together(api_key=self.api_key.get_secret_value())

    @requires_dependencies(["together"], extras="togetherai")
    def get_async_client(self) -> "AsyncTogether":
        """Create an asynchronous Together client using the configured API key."""
        from together import AsyncTogether

        return AsyncTogether(api_key=self.api_key.get_secret_value())
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Synchronous Together AI embedding encoder."""

    config: TogetherAIEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    def embed_query(self, query: str) -> list[float]:
        """Embed one query string by sending it as a single-item batch."""
        vectors = self._embed_documents(elements=[query])
        return vectors[0]

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed every element's "text" (empty string if absent) and attach the results."""
        texts = [element.get("text", "") for element in elements]
        vectors = self._embed_documents(texts)
        return self._add_embeddings_to_elements(elements, vectors)

    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Embed all texts in one request and return one vector per input text."""
        together_client = self.config.get_client()
        try:
            result = together_client.embeddings.create(
                input=elements,
                model=self.config.embedder_model_name,
            )
        except Exception as err:
            raise self.wrap_error(e=err)
        # Read exactly one embedding per input text, by position.
        count = len(elements)
        return [result.data[position].embedding for position in range(count)]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class AsyncTogetherAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
    """Asynchronous Together AI embedding encoder."""

    config: TogetherAIEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    async def embed_query(self, query: str) -> list[float]:
        """Embed one query string by sending it as a single-item batch."""
        vectors = await self._embed_documents(elements=[query])
        return vectors[0]

    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed every element's "text" (empty string if absent) and attach the results."""
        texts = [element.get("text", "") for element in elements]
        vectors = await self._embed_documents(texts)
        return self._add_embeddings_to_elements(elements, vectors)

    async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Embed all texts in one request and return one vector per input text."""
        together_client = self.config.get_async_client()
        try:
            result = await together_client.embeddings.create(
                input=elements,
                model=self.config.embedder_model_name,
            )
        except Exception as err:
            raise self.wrap_error(e=err)
        # Read exactly one embedding per input text, by position.
        count = len(elements)
        return [result.data[position].embedding for position in range(count)]
|