unstructured-ingest 0.3.13__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import json
|
|
3
|
+
import typing as t
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
8
|
+
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
9
|
+
from unstructured_ingest.interfaces import (
|
|
10
|
+
AccessConfig,
|
|
11
|
+
BaseConnectorConfig,
|
|
12
|
+
BaseDestinationConnector,
|
|
13
|
+
WriteConfig,
|
|
14
|
+
)
|
|
15
|
+
from unstructured_ingest.logger import logger
|
|
16
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
|
+
|
|
18
|
+
if t.TYPE_CHECKING:
|
|
19
|
+
from weaviate import Client
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class WeaviateAccessConfig(AccessConfig):
    """Credentials that can be used to authenticate against Weaviate.

    ``WeaviateDestinationConnector._resolve_auth_method`` checks these fields in
    a fixed order (anonymous, access token, API key, client secret, then
    username + password) and uses the first option that is set.
    """

    # Bearer-token auth: both values are handed to weaviate's AuthBearerToken.
    access_token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    refresh_token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # API-key auth: handed to weaviate's AuthApiKey.
    api_key: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # Client-credentials auth: handed to AuthClientCredentials together with scope.
    client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # Passed as `scope` to both AuthClientCredentials and AuthClientPassword.
    scope: t.Optional[t.List[str]] = None
    # Password auth: only used when username and password are both set.
    username: t.Optional[str] = None
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # When true, no auth object is built at all, regardless of the other fields.
    anonymous: bool = False
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class SimpleWeaviateConfig(BaseConnectorConfig):
    """Connection settings for the Weaviate destination connector."""

    # Credentials used to pick the authentication method.
    access_config: WeaviateAccessConfig
    # Passed as `url` when the weaviate Client is constructed.
    host_url: str
    # Name of the Weaviate class that every written object is added to.
    class_name: str
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class WeaviateWriteConfig(WriteConfig):
    """Write-time settings for the Weaviate destination connector."""

    # Forwarded to `client.batch.configure(batch_size=...)` before writing.
    batch_size: int = 100
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class WeaviateDestinationConnector(BaseDestinationConnector):
    """Destination connector that writes element dictionaries to a Weaviate class.

    The Weaviate client is created lazily on first access of ``client`` and is
    cleared on the copy that ``to_dict`` serializes.
    """

    write_config: WeaviateWriteConfig
    connector_config: SimpleWeaviateConfig
    _client: t.Optional["Client"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """
        The _client variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_client"):
            setattr(self_cp, "_client", None)
        return _asdict(self_cp, **kwargs)

    @property
    @requires_dependencies(["weaviate"], extras="weaviate")
    def client(self) -> "Client":
        """Return the Weaviate client, creating and caching it on first use."""
        if self._client is None:
            from weaviate import Client

            auth = self._resolve_auth_method()
            self._client = Client(url=self.connector_config.host_url, auth_client_secret=auth)
        return self._client

    @requires_dependencies(["weaviate"], extras="weaviate")
    @DestinationConnectionError.wrap
    def initialize(self):
        """Force creation of the client so connection problems surface early."""
        _ = self.client

    @requires_dependencies(["weaviate"], extras="weaviate")
    def check_connection(self):
        """Validate the connection by building the client.

        Raises:
            SourceConnectionError: if the client cannot be created. The original
                exception is attached as the cause.
        """
        try:
            _ = self.client
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            # NOTE(review): a destination connector raising SourceConnectionError looks
            # inconsistent, but the type is kept so existing callers keep working.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def _resolve_auth_method(self):
        """Build the weaviate auth object for the first configured credential.

        Returns:
            ``None`` for anonymous access or when no credential is set, otherwise
            one of weaviate's ``AuthBearerToken``, ``AuthApiKey``,
            ``AuthClientCredentials`` or ``AuthClientPassword`` objects.
        """
        access_configs = self.connector_config.access_config
        if access_configs.anonymous:
            return None

        if access_configs.access_token:
            from weaviate.auth import AuthBearerToken

            return AuthBearerToken(
                access_token=access_configs.access_token,
                refresh_token=access_configs.refresh_token,
            )
        elif access_configs.api_key:
            from weaviate.auth import AuthApiKey

            return AuthApiKey(api_key=access_configs.api_key)
        elif access_configs.client_secret:
            from weaviate.auth import AuthClientCredentials

            return AuthClientCredentials(
                client_secret=access_configs.client_secret, scope=access_configs.scope
            )
        elif access_configs.username and access_configs.password:
            from weaviate.auth import AuthClientPassword

            return AuthClientPassword(
                username=access_configs.username,
                password=access_configs.password,
                scope=access_configs.scope,
            )
        return None

    def conform_dict(self, data: dict) -> None:
        """
        Updates the element dictionary to conform to the Weaviate schema

        The dictionary is modified in place. Nested structures are JSON-encoded,
        timestamps are re-formatted and a few scalar fields are cast to ``str``.
        Fields that are missing or falsy are left untouched.
        """
        from dateutil import parser

        # NOTE(review): the trailing "Z" is a literal; parsed timestamps are not
        # converted to UTC before formatting.
        date_format = "%Y-%m-%dT%H:%M:%S.%fZ"

        # `or {}` also covers sections that are present but set to None.
        metadata = data.get("metadata") or {}
        data_source = metadata.get("data_source") or {}
        coordinates = metadata.get("coordinates") or {}

        def rewrite(section: dict, key: str, convert) -> None:
            # Replace section[key] with its converted form when it holds a truthy value.
            if value := section.get(key):
                section[key] = convert(value)

        def to_weaviate_date(value) -> str:
            return parser.parse(value).strftime(date_format)

        # Dict as string formatting
        rewrite(data_source, "record_locator", json.dumps)

        # Array of items as string formatting
        rewrite(coordinates, "points", json.dumps)
        rewrite(metadata, "links", json.dumps)
        rewrite(data_source, "permissions_data", json.dumps)

        # Datetime formatting
        rewrite(data_source, "date_created", to_weaviate_date)
        rewrite(data_source, "date_modified", to_weaviate_date)
        rewrite(data_source, "date_processed", to_weaviate_date)
        rewrite(metadata, "last_modified", to_weaviate_date)

        # String casting
        rewrite(data_source, "version", str)
        rewrite(metadata, "page_number", str)
        rewrite(metadata, "regex_metadata", json.dumps)

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Add every element to the configured Weaviate class using the batch API.

        The ``"embeddings"`` entry of each element, if any, is sent as the object's
        vector rather than as a property. The caller's dictionaries are not modified.
        """
        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"class {self.connector_config.class_name} "
            f"at {self.connector_config.host_url}",
        )

        self.client.batch.configure(batch_size=self.write_config.batch_size)
        with self.client.batch as b:
            for e in elements_dict:
                # Build a copy without the vector instead of popping from the input.
                properties = {k: v for k, v in e.items() if k != "embeddings"}
                b.add_data_object(
                    properties,
                    self.connector_config.class_name,
                    vector=e.get("embeddings"),
                )
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
6
|
+
from unstructured_ingest.interfaces import (
|
|
7
|
+
BaseConnectorConfig,
|
|
8
|
+
BaseSingleIngestDoc,
|
|
9
|
+
BaseSourceConnector,
|
|
10
|
+
IngestDocCleanupMixin,
|
|
11
|
+
SourceConnectorCleanupMixin,
|
|
12
|
+
SourceMetadata,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
16
|
+
|
|
17
|
+
if t.TYPE_CHECKING:
|
|
18
|
+
from wikipedia import WikipediaPage
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class SimpleWikipediaConfig(BaseConnectorConfig):
    """Settings identifying the Wikipedia page to ingest."""

    # Title handed to `wikipedia.page(...)`; also used to derive local file names.
    page_title: str
    # Forwarded unchanged as the `auto_suggest` argument of `wikipedia.page(...)`.
    auto_suggest: bool = False
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class WikipediaIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Base ingest doc for one Wikipedia page.

    Subclasses supply ``filename``, ``text`` and ``_output_filename`` for the
    representation of the page they store.
    """

    connector_config: SimpleWikipediaConfig = field(repr=False)

    @property
    @requires_dependencies(["wikipedia"], extras="wikipedia")
    def page(self) -> "WikipediaPage":
        """Look up the configured page; every access performs a new lookup."""
        import wikipedia

        config = self.connector_config
        return wikipedia.page(config.page_title, auto_suggest=config.auto_suggest)

    def get_filename_prefix(self) -> str:
        """Return the page title with each run of whitespace replaced by one hyphen."""
        words = str(self.connector_config.page_title).split()
        return "-".join(words)

    @property
    def filename(self) -> Path:
        """Local download path; must be provided by a subclass."""
        raise NotImplementedError()

    @property
    def text(self) -> str:
        """Content written to ``filename``; must be provided by a subclass."""
        raise NotImplementedError()

    @property
    def _output_filename(self):
        """Output path of the processed result; must be provided by a subclass."""
        raise NotImplementedError()

    @property
    def date_created(self) -> t.Optional[str]:
        """Always ``None`` for this connector."""
        return None

    @property
    def date_modified(self) -> t.Optional[str]:
        """Always ``None`` for this connector."""
        return None

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Return the page title together with the URL held in the source metadata."""
        locator = {
            "page_title": self.connector_config.page_title,
            "page_url": self.source_metadata.source_url,  # type: ignore
        }
        return locator

    def _create_full_tmp_dir_path(self):
        """Make sure the directory that will contain ``filename`` exists."""
        target_dir = self.filename.parent
        target_dir.mkdir(parents=True, exist_ok=True)

    @requires_dependencies(["wikipedia"], extras="wikipedia")
    def update_source_metadata(self):
        """Refresh ``source_metadata`` from the page, or mark it as missing."""
        from wikipedia.exceptions import PageError

        try:
            page = self.page
        except PageError:
            # The page lookup failed: record only that the source does not exist.
            metadata = SourceMetadata(
                exists=False,
            )
        else:
            metadata = SourceMetadata(
                version=page.revision_id,
                source_url=page.url,
                exists=True,
            )
        self.source_metadata = metadata

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetches the "remote" doc and stores it locally on the filesystem."""
        self._create_full_tmp_dir_path()
        self.update_source_metadata()
        # The file is opened before `text` is evaluated, as in the original flow.
        with self.filename.open("w", encoding="utf8") as handle:
            handle.write(self.text)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass
class WikipediaIngestHTMLDoc(WikipediaIngestDoc):
    """Ingest doc that stores the HTML rendering of the page."""

    registry_name: str = "wikipedia_html"

    @property
    def filename(self) -> Path:
        """Resolved ``<download_dir>/<prefix>.html`` path."""
        download_dir = Path(self.read_config.download_dir)
        return (download_dir / f"{self.get_filename_prefix()}.html").resolve()

    @property
    def text(self):
        """HTML of the page, fetched on access."""
        return self._get_html()

    @SourceConnectionNetworkError.wrap
    def _get_html(self):
        """Fetch the page HTML through the ``wikipedia`` page object."""
        page = self.page
        return page.html()

    @property
    def _output_filename(self):
        """``<output_dir>/<prefix>-html.json`` path for the processed output."""
        output_dir = Path(self.processor_config.output_dir)
        return output_dir / f"{self.get_filename_prefix()}-html.json"
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@dataclass
class WikipediaIngestTextDoc(WikipediaIngestDoc):
    """Ingest doc that stores the text content of the page."""

    registry_name: str = "wikipedia_text"

    @property
    def filename(self) -> Path:
        """Resolved ``<download_dir>/<prefix>.txt`` path."""
        download_dir = Path(self.read_config.download_dir)
        return (download_dir / f"{self.get_filename_prefix()}.txt").resolve()

    @property
    def text(self):
        """Content of the page, fetched on access."""
        return self._get_content()

    @SourceConnectionNetworkError.wrap
    def _get_content(self):
        """Read ``content`` from the ``wikipedia`` page object."""
        page = self.page
        return page.content

    @property
    def _output_filename(self):
        """``<output_dir>/<prefix>-txt.json`` path for the processed output."""
        output_dir = Path(self.processor_config.output_dir)
        return output_dir / f"{self.get_filename_prefix()}-txt.json"
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@dataclass
class WikipediaIngestSummaryDoc(WikipediaIngestDoc):
    """Ingest doc that stores the summary of the page."""

    registry_name: str = "wikipedia_summary"

    @property
    def filename(self) -> Path:
        """Resolved ``<download_dir>/<prefix>-summary.txt`` path."""
        download_dir = Path(self.read_config.download_dir)
        return (download_dir / f"{self.get_filename_prefix()}-summary.txt").resolve()

    @property
    def text(self):
        """Summary of the page, fetched on access."""
        return self._get_summary()

    @SourceConnectionNetworkError.wrap
    def _get_summary(self):
        """Read ``summary`` from the ``wikipedia`` page object."""
        page = self.page
        return page.summary

    @property
    def _output_filename(self):
        """``<output_dir>/<prefix>-summary.json`` path for the processed output."""
        output_dir = Path(self.processor_config.output_dir)
        return output_dir / f"{self.get_filename_prefix()}-summary.json"
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@dataclass
class WikipediaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that yields text, HTML and summary docs for one page."""

    connector_config: SimpleWikipediaConfig

    def initialize(self):
        """Nothing to set up for this connector."""
        pass

    @requires_dependencies(["wikipedia"], extras="wikipedia")
    def check_connection(self):
        """Validate the configuration by looking the page up once.

        Raises:
            SourceConnectionError: if the lookup raises any exception.
        """
        import wikipedia

        try:
            wikipedia.page(
                self.connector_config.page_title,
                auto_suggest=self.connector_config.auto_suggest,
            )
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}")

    def get_ingest_docs(self):
        """Return one ingest doc per stored representation: text, HTML, summary."""
        doc_types = (
            WikipediaIngestTextDoc,
            WikipediaIngestHTMLDoc,
            WikipediaIngestSummaryDoc,
        )
        return [
            doc_type(
                processor_config=self.processor_config,
                connector_config=self.connector_config,
                read_config=self.read_config,
            )
            for doc_type in doc_types
        ]
|
|
File without changes
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
|
|
4
|
+
from pydantic import Field
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
|
7
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from openai import AzureOpenAI
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
    """OpenAI embedding configuration that talks to an Azure OpenAI endpoint."""

    api_version: str = Field(default="2024-06-01", description="Azure API version")
    azure_endpoint: str
    embedder_model_name: str = Field(alias="model_name", default="text-embedding-ada-002")

    @requires_dependencies(["openai"], extras="openai")
    def get_client(self) -> "AzureOpenAI":
        """Create an ``AzureOpenAI`` client from the API key, version and endpoint."""
        from openai import AzureOpenAI

        client_kwargs = {
            "api_key": self.api_key.get_secret_value(),
            "api_version": self.api_version,
            "azure_endpoint": self.azure_endpoint,
        }
        return AzureOpenAI(**client_kwargs)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
    """OpenAI embedding encoder whose config is the Azure-specific variant."""

    # Narrows the config type so the Azure client factory is the one used.
    config: AzureOpenAIEmbeddingConfig
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
from contextlib import asynccontextmanager
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import TYPE_CHECKING, AsyncIterable
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, SecretStr
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.embed.interfaces import (
|
|
11
|
+
AsyncBaseEmbeddingEncoder,
|
|
12
|
+
BaseEmbeddingEncoder,
|
|
13
|
+
EmbeddingConfig,
|
|
14
|
+
)
|
|
15
|
+
from unstructured_ingest.logger import logger
|
|
16
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
|
+
from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
    from botocore.client import BaseClient

    # Typing-only stand-ins: they exist solely so annotations in this module can
    # describe the `invoke_model` call shape; they are never defined at runtime.
    class BedrockClient(BaseClient):
        """Synchronous client shape exposing ``invoke_model``."""

        def invoke_model(self, body: str, modelId: str, accept: str, contentType: str) -> dict:
            pass

    class AsyncBedrockClient(BaseClient):
        """Asynchronous client shape exposing an awaitable ``invoke_model``."""

        async def invoke_model(
            self, body: str, modelId: str, accept: str, contentType: str
        ) -> dict:
            pass
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def conform_query(query: str, provider: str) -> dict:
    """Build the request body for a Bedrock embedding model.

    Args:
        query: Text to embed. Newlines are replaced with single spaces.
        provider: Provider prefix of the model id, e.g. ``"cohere"`` or ``"amazon"``.

    Returns:
        ``{"input_type": "search_document", "texts": [text]}`` for ``"cohere"``,
        otherwise ``{"inputText": text}``.
    """
    # Replace newlines, which can negatively affect performance. Both "\r\n" and
    # "\n" are handled explicitly so the result does not depend on the platform's
    # os.linesep ("\r\n" first, so it collapses to a single space).
    text = query.replace("\r\n", " ").replace("\n", " ")

    # format input body for provider
    if provider == "cohere":
        return {"input_type": "search_document", "texts": [text]}
    # includes common provider == "amazon"
    return {"inputText": text}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class BedrockEmbeddingConfig(EmbeddingConfig):
    """AWS credentials, region and model id for Bedrock embeddings."""

    aws_access_key_id: SecretStr
    aws_secret_access_key: SecretStr
    region_name: str = "us-west-2"
    embed_model_name: str = Field(alias="model_name", default="amazon.titan-embed-text-v1")

    def wrap_error(self, e: Exception) -> Exception:
        """Translate a botocore ``ClientError`` into one of the ingest error types.

        Returns:
            ``UserError``, ``RateLimitError``, ``UserAuthError`` or ``ProviderError``
            built from the response's ``Error`` entry, chosen by HTTP status and
            error code. Any other exception is logged and returned unchanged.
        """
        from botocore.exceptions import ClientError

        if isinstance(e, ClientError):
            # https://docs.aws.amazon.com/awssupport/latest/APIReference/CommonErrors.html
            response = e.response
            status = response["ResponseMetadata"]["HTTPStatusCode"]
            details = response["Error"]
            code = details["Code"]

            if status == 400:
                # Error codes with a dedicated type; other 400s fall through below.
                by_code = {
                    "ValidationError": UserError,
                    "ThrottlingException": RateLimitError,
                    "NotAuthorized": UserAuthError,
                    "AccessDeniedException": UserAuthError,
                }
                error_type = by_code.get(code)
                if error_type is not None:
                    return error_type(details)
            if status == 403:
                return UserAuthError(details)
            if 400 <= status < 500:
                return UserError(details)
            if status >= 500:
                return ProviderError(details)

        logger.error(f"unhandled exception from bedrock: {e}", exc_info=True)
        return e

    @requires_dependencies(
        ["boto3", "numpy", "botocore"],
        extras="bedrock",
    )
    def get_client(self) -> "BedrockClient":
        """Create a synchronous ``bedrock-runtime`` client from the stored credentials."""
        import boto3

        return boto3.client(
            service_name="bedrock-runtime",
            aws_access_key_id=self.aws_access_key_id.get_secret_value(),
            aws_secret_access_key=self.aws_secret_access_key.get_secret_value(),
            region_name=self.region_name,
        )

    @requires_dependencies(
        ["aioboto3"],
        extras="bedrock",
    )
    @asynccontextmanager
    async def get_async_client(self) -> AsyncIterable["AsyncBedrockClient"]:
        """Yield an ``aioboto3`` ``bedrock-runtime`` client for the ``async with`` body."""
        import aioboto3

        client_context = aioboto3.Session().client(
            "bedrock-runtime",
            aws_access_key_id=self.aws_access_key_id.get_secret_value(),
            aws_secret_access_key=self.aws_secret_access_key.get_secret_value(),
            region_name=self.region_name,
        )
        async with client_context as aws_bedrock:
            yield aws_bedrock
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@dataclass
class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
    """Synchronous embedding encoder backed by the Bedrock runtime API."""

    config: BedrockEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    def embed_query(self, query: str) -> list[float]:
        """Call out to Bedrock embedding endpoint."""
        model_id = self.config.embed_model_name
        # The provider is the part of the model id before the first ".".
        provider = model_id.partition(".")[0]
        payload = conform_query(query=query, provider=provider)

        client = self.config.get_client()
        # invoke bedrock API
        try:
            response = client.invoke_model(
                body=json.dumps(payload),
                modelId=self.config.embed_model_name,
                accept="application/json",
                contentType="application/json",
            )
        except Exception as e:
            raise self.wrap_error(e=e)

        # format output based on provider
        raw_body = response.get("body").read()
        parsed = json.loads(raw_body)
        if provider != "cohere":
            # includes common provider == "amazon"
            return parsed.get("embedding")
        return parsed.get("embeddings")[0]

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the ``"text"`` of every element, one request per element."""
        vectors = []
        for element in elements:
            vectors.append(self.embed_query(query=element.get("text", "")))
        return self._add_embeddings_to_elements(elements, vectors)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@dataclass
class AsyncBedrockEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
    """Asynchronous embedding encoder backed by the Bedrock runtime API."""

    config: BedrockEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    async def embed_query(self, query: str) -> list[float]:
        """Call out to Bedrock embedding endpoint.

        Raises:
            UserError, RateLimitError, UserAuthError, ProviderError: when
                ``wrap_error`` classifies a failure of the ``invoke_model`` call.
            ValueError: for any other failure while calling the endpoint or
                reading its response.
        """
        provider = self.config.embed_model_name.split(".")[0]
        body = conform_query(query=query, provider=provider)
        try:
            async with self.config.get_async_client() as bedrock_client:
                # invoke bedrock API
                try:
                    response = await bedrock_client.invoke_model(
                        body=json.dumps(body),
                        modelId=self.config.embed_model_name,
                        accept="application/json",
                        contentType="application/json",
                    )
                except Exception as e:
                    wrapped = self.wrap_error(e=e)
                    if wrapped is e:
                        # Not classified: re-raise as-is for the generic handler.
                        raise
                    raise wrapped from e
                async with response.get("body") as client_response:
                    response_body = await client_response.json()

                    # format output based on provider
                    if provider == "cohere":
                        return response_body.get("embeddings")[0]
                    else:
                        # includes common provider == "amazon"
                        return response_body.get("embedding")
        except (UserError, RateLimitError, UserAuthError, ProviderError):
            # Already classified by wrap_error; do not hide the type behind a
            # generic ValueError (this matches the synchronous encoder).
            raise
        except Exception as e:
            raise ValueError(f"Error raised by inference endpoint: {e}") from e

    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the ``"text"`` of every element concurrently via ``asyncio.gather``."""
        embeddings = await asyncio.gather(
            *[self.embed_query(query=e.get("text", "")) for e in elements]
        )
        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
        return elements_with_embeddings
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import TYPE_CHECKING, Optional
|
|
3
|
+
|
|
4
|
+
from pydantic import Field
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
7
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from sentence_transformers import SentenceTransformer
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class HuggingFaceEmbeddingConfig(EmbeddingConfig):
    """Model name and keyword arguments for a ``sentence_transformers`` embedder."""

    embedder_model_name: Optional[str] = Field(
        default="sentence-transformers/all-MiniLM-L6-v2", alias="model_name"
    )
    # Extra keyword arguments for the SentenceTransformer constructor.
    embedder_model_kwargs: Optional[dict] = Field(
        default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
    )
    # Keyword arguments forwarded to `encode(...)` by the encoder.
    encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
    cache_folder: Optional[str] = Field(default=None)

    @requires_dependencies(
        ["sentence_transformers"],
        extras="embed-huggingface",
    )
    def get_client(self) -> "SentenceTransformer":
        """Instantiate the ``SentenceTransformer`` model described by this config.

        ``embedder_model_kwargs`` may be ``None`` (the field is optional); in that
        case no extra keyword arguments are passed.
        """
        from sentence_transformers import SentenceTransformer

        return SentenceTransformer(
            model_name_or_path=self.embedder_model_name,
            cache_folder=self.cache_folder,
            # `**None` would raise TypeError, so fall back to an empty mapping.
            **(self.embedder_model_kwargs or {}),
        )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder that runs a ``sentence_transformers`` model."""

    config: HuggingFaceEmbeddingConfig

    def embed_query(self, query: str) -> list[float]:
        """Return the embedding of a single piece of text."""
        return self._embed_documents(texts=[query])[0]

    def _embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Encode ``texts`` with a client obtained from the config for this call.

        ``encode_kwargs`` may be ``None`` (the field is optional); in that case
        ``encode`` is called without extra keyword arguments.
        """
        client = self.config.get_client()
        # `**None` would raise TypeError, so fall back to an empty mapping.
        embeddings = client.encode(texts, **(self.config.encode_kwargs or {}))
        return embeddings.tolist()

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the ``"text"`` of every element in one ``encode`` call."""
        embeddings = self._embed_documents([e.get("text", "") for e in elements])
        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
        return elements_with_embeddings
|