unstructured-ingest 1.0.13__tar.gz → 1.0.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/PKG-INFO +1 -1
- unstructured_ingest-1.0.14/unstructured_ingest/__version__.py +1 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/bedrock.py +51 -16
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/huggingface.py +1 -1
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/interfaces.py +3 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/mixedbreadai.py +2 -2
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/octoai.py +19 -2
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/openai.py +34 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/togetherai.py +17 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/vertexai.py +1 -1
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/google_drive.py +16 -39
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/embedder.py +4 -0
- unstructured_ingest-1.0.13/unstructured_ingest/__version__.py +0 -1
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/.gitignore +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/LICENSE.md +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/README.md +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/pyproject.toml +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/README.md +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/base/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/base/cmd.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/base/dest.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/base/importer.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/base/src.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/cli.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/cmds.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/utils/click.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/data_types/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/data_types/entities.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/data_types/file_data.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/azure_openai.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/voyageai.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/error.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/errors_v2.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/interfaces/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/interfaces/connector.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/interfaces/downloader.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/interfaces/indexer.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/interfaces/process.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/interfaces/processor.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/interfaces/upload_stager.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/interfaces/uploader.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/logger.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/main.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/otel.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/interfaces.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/otel.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/pipeline.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/steps/download.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/steps/embed.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/steps/filter.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/steps/index.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/steps/partition.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/steps/stage.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/steps/upload.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/chunker.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connector_registry.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/airtable.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/astradb.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/chroma.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/confluence.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/databricks/volumes.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/delta_table.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/discord.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/fsspec/s3.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/github.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/gitlab.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/jira.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/local.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/milvus.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/mongodb.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/connector.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/onedrive.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/outlook.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/pinecone.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/redisdb.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/salesforce.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/sharepoint.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/slack.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/sql/sql.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/utils.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/vectara.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/filter.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/partitioner.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/uncompress.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/unstructured_api.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/utils/chunking.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/utils/compression.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/utils/constants.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/utils/data_prep.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/utils/dep_check.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/utils/html.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/utils/ndjson.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/utils/pydantic_models.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
- {unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/utils/table.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.14" # pragma: no cover
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/bedrock.py
RENAMED
|
@@ -26,16 +26,20 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
26
26
|
if TYPE_CHECKING:
|
|
27
27
|
from botocore.client import BaseClient
|
|
28
28
|
|
|
29
|
-
class
|
|
29
|
+
class BedrockRuntimeClient(BaseClient):
|
|
30
30
|
def invoke_model(self, body: str, modelId: str, accept: str, contentType: str) -> dict:
|
|
31
31
|
pass
|
|
32
32
|
|
|
33
|
-
class
|
|
33
|
+
class AsyncBedrockRuntimeClient(BaseClient):
|
|
34
34
|
async def invoke_model(
|
|
35
35
|
self, body: str, modelId: str, accept: str, contentType: str
|
|
36
36
|
) -> dict:
|
|
37
37
|
pass
|
|
38
38
|
|
|
39
|
+
class BedrockClient(BaseClient):
|
|
40
|
+
def list_foundation_models(self, byOutputModality: str) -> dict:
|
|
41
|
+
pass
|
|
42
|
+
|
|
39
43
|
|
|
40
44
|
def conform_query(query: str, provider: str) -> dict:
|
|
41
45
|
# replace newlines, which can negatively affect performance.
|
|
@@ -87,19 +91,49 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
|
|
|
87
91
|
logger.error(f"unhandled exception from bedrock: {e}", exc_info=True)
|
|
88
92
|
return e
|
|
89
93
|
|
|
94
|
+
def run_precheck(self) -> None:
|
|
95
|
+
client = self.get_bedrock_client()
|
|
96
|
+
try:
|
|
97
|
+
model_info = client.list_foundation_models(byOutputModality="EMBEDDING")
|
|
98
|
+
summaries = model_info.get("modelSummaries", [])
|
|
99
|
+
model_ids = [m["modelId"].split(":")[0] for m in summaries]
|
|
100
|
+
arns = [":".join(m["modelArn"].split(":")[0:-1]) for m in summaries]
|
|
101
|
+
|
|
102
|
+
if self.embedder_model_name not in model_ids and self.embedder_model_name not in arns:
|
|
103
|
+
raise UserError(
|
|
104
|
+
"model '{}' not found either : {} or {}".format(
|
|
105
|
+
self.embedder_model_name, ", ".join(model_ids), ", ".join(arns)
|
|
106
|
+
)
|
|
107
|
+
)
|
|
108
|
+
except Exception as e:
|
|
109
|
+
raise self.wrap_error(e=e)
|
|
110
|
+
|
|
111
|
+
def get_client_kwargs(self) -> dict:
|
|
112
|
+
return {
|
|
113
|
+
"aws_access_key_id": self.aws_access_key_id.get_secret_value(),
|
|
114
|
+
"aws_secret_access_key": self.aws_secret_access_key.get_secret_value(),
|
|
115
|
+
"region_name": self.region_name,
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
@requires_dependencies(
|
|
119
|
+
["boto3"],
|
|
120
|
+
extras="bedrock",
|
|
121
|
+
)
|
|
122
|
+
def get_bedrock_client(self) -> "BedrockClient":
|
|
123
|
+
import boto3
|
|
124
|
+
|
|
125
|
+
bedrock_client = boto3.client(service_name="bedrock", **self.get_client_kwargs())
|
|
126
|
+
|
|
127
|
+
return bedrock_client
|
|
128
|
+
|
|
90
129
|
@requires_dependencies(
|
|
91
130
|
["boto3", "numpy", "botocore"],
|
|
92
131
|
extras="bedrock",
|
|
93
132
|
)
|
|
94
|
-
def get_client(self) -> "
|
|
133
|
+
def get_client(self) -> "BedrockRuntimeClient":
|
|
95
134
|
import boto3
|
|
96
135
|
|
|
97
|
-
bedrock_client = boto3.client(
|
|
98
|
-
service_name="bedrock-runtime",
|
|
99
|
-
aws_access_key_id=self.aws_access_key_id.get_secret_value(),
|
|
100
|
-
aws_secret_access_key=self.aws_secret_access_key.get_secret_value(),
|
|
101
|
-
region_name=self.region_name,
|
|
102
|
-
)
|
|
136
|
+
bedrock_client = boto3.client(service_name="bedrock-runtime", **self.get_client_kwargs())
|
|
103
137
|
|
|
104
138
|
return bedrock_client
|
|
105
139
|
|
|
@@ -108,16 +142,11 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
|
|
|
108
142
|
extras="bedrock",
|
|
109
143
|
)
|
|
110
144
|
@asynccontextmanager
|
|
111
|
-
async def get_async_client(self) -> AsyncIterable["
|
|
145
|
+
async def get_async_client(self) -> AsyncIterable["AsyncBedrockRuntimeClient"]:
|
|
112
146
|
import aioboto3
|
|
113
147
|
|
|
114
148
|
session = aioboto3.Session()
|
|
115
|
-
async with session.client(
|
|
116
|
-
"bedrock-runtime",
|
|
117
|
-
aws_access_key_id=self.aws_access_key_id.get_secret_value(),
|
|
118
|
-
aws_secret_access_key=self.aws_secret_access_key.get_secret_value(),
|
|
119
|
-
region_name=self.region_name,
|
|
120
|
-
) as aws_bedrock:
|
|
149
|
+
async with session.client("bedrock-runtime", **self.get_client_kwargs()) as aws_bedrock:
|
|
121
150
|
yield aws_bedrock
|
|
122
151
|
|
|
123
152
|
|
|
@@ -125,6 +154,9 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
|
|
|
125
154
|
class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
126
155
|
config: BedrockEmbeddingConfig
|
|
127
156
|
|
|
157
|
+
def precheck(self):
|
|
158
|
+
self.config.run_precheck()
|
|
159
|
+
|
|
128
160
|
def wrap_error(self, e: Exception) -> Exception:
|
|
129
161
|
return self.config.wrap_error(e=e)
|
|
130
162
|
|
|
@@ -168,6 +200,9 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
168
200
|
class AsyncBedrockEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
169
201
|
config: BedrockEmbeddingConfig
|
|
170
202
|
|
|
203
|
+
def precheck(self):
|
|
204
|
+
self.config.run_precheck()
|
|
205
|
+
|
|
171
206
|
def wrap_error(self, e: Exception) -> Exception:
|
|
172
207
|
return self.config.wrap_error(e=e)
|
|
173
208
|
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/huggingface.py
RENAMED
|
@@ -23,7 +23,7 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
|
|
|
23
23
|
|
|
24
24
|
@requires_dependencies(
|
|
25
25
|
["sentence_transformers"],
|
|
26
|
-
extras="
|
|
26
|
+
extras="huggingface",
|
|
27
27
|
)
|
|
28
28
|
def get_client(self) -> "SentenceTransformer":
|
|
29
29
|
from sentence_transformers import SentenceTransformer
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/interfaces.py
RENAMED
|
@@ -20,6 +20,9 @@ class EmbeddingConfig(BaseModel):
|
|
|
20
20
|
class BaseEncoder(ABC):
|
|
21
21
|
config: EmbeddingConfig
|
|
22
22
|
|
|
23
|
+
def precheck(self):
|
|
24
|
+
pass
|
|
25
|
+
|
|
23
26
|
def initialize(self):
|
|
24
27
|
"""Initializes the embedding encoder class. Should also validate the instance
|
|
25
28
|
is properly configured: e.g., embed a single a element"""
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/mixedbreadai.py
RENAMED
|
@@ -42,7 +42,7 @@ class MixedbreadAIEmbeddingConfig(EmbeddingConfig):
|
|
|
42
42
|
|
|
43
43
|
@requires_dependencies(
|
|
44
44
|
["mixedbread_ai"],
|
|
45
|
-
extras="
|
|
45
|
+
extras="mixedbreadai",
|
|
46
46
|
)
|
|
47
47
|
def get_client(self) -> "MixedbreadAI":
|
|
48
48
|
"""
|
|
@@ -59,7 +59,7 @@ class MixedbreadAIEmbeddingConfig(EmbeddingConfig):
|
|
|
59
59
|
|
|
60
60
|
@requires_dependencies(
|
|
61
61
|
["mixedbread_ai"],
|
|
62
|
-
extras="
|
|
62
|
+
extras="mixedbreadai",
|
|
63
63
|
)
|
|
64
64
|
def get_async_client(self) -> "AsyncMixedbreadAI":
|
|
65
65
|
from mixedbread_ai.client import AsyncMixedbreadAI
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/octoai.py
RENAMED
|
@@ -54,9 +54,20 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
|
|
|
54
54
|
logger.error(f"unhandled exception from openai: {e}", exc_info=True)
|
|
55
55
|
return e
|
|
56
56
|
|
|
57
|
+
def run_precheck(self) -> None:
|
|
58
|
+
client = self.get_client()
|
|
59
|
+
try:
|
|
60
|
+
models = [m.id for m in list(client.models.list())]
|
|
61
|
+
if self.embedder_model_name not in models:
|
|
62
|
+
raise UserError(
|
|
63
|
+
"model '{}' not found: {}".format(self.embedder_model_name, ", ".join(models))
|
|
64
|
+
)
|
|
65
|
+
except Exception as e:
|
|
66
|
+
raise self.wrap_error(e=e)
|
|
67
|
+
|
|
57
68
|
@requires_dependencies(
|
|
58
69
|
["openai", "tiktoken"],
|
|
59
|
-
extras="
|
|
70
|
+
extras="octoai",
|
|
60
71
|
)
|
|
61
72
|
def get_client(self) -> "OpenAI":
|
|
62
73
|
"""Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
|
|
@@ -66,7 +77,7 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
|
|
|
66
77
|
|
|
67
78
|
@requires_dependencies(
|
|
68
79
|
["openai", "tiktoken"],
|
|
69
|
-
extras="
|
|
80
|
+
extras="octoai",
|
|
70
81
|
)
|
|
71
82
|
def get_async_client(self) -> "AsyncOpenAI":
|
|
72
83
|
"""Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
|
|
@@ -79,6 +90,9 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
|
|
|
79
90
|
class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
80
91
|
config: OctoAiEmbeddingConfig
|
|
81
92
|
|
|
93
|
+
def precheck(self):
|
|
94
|
+
self.config.run_precheck()
|
|
95
|
+
|
|
82
96
|
def wrap_error(self, e: Exception) -> Exception:
|
|
83
97
|
return self.config.wrap_error(e=e)
|
|
84
98
|
|
|
@@ -99,6 +113,9 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
99
113
|
class AsyncOctoAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
100
114
|
config: OctoAiEmbeddingConfig
|
|
101
115
|
|
|
116
|
+
def precheck(self):
|
|
117
|
+
self.config.run_precheck()
|
|
118
|
+
|
|
102
119
|
def wrap_error(self, e: Exception) -> Exception:
|
|
103
120
|
return self.config.wrap_error(e=e)
|
|
104
121
|
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/openai.py
RENAMED
|
@@ -28,6 +28,7 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
|
|
|
28
28
|
embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
|
|
29
29
|
base_url: Optional[str] = None
|
|
30
30
|
|
|
31
|
+
@requires_dependencies(["openai"], extras="openai")
|
|
31
32
|
def wrap_error(self, e: Exception) -> Exception:
|
|
32
33
|
if is_internal_error(e=e):
|
|
33
34
|
return e
|
|
@@ -54,6 +55,33 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
|
|
|
54
55
|
logger.error(f"unhandled exception from openai: {e}", exc_info=True)
|
|
55
56
|
return e
|
|
56
57
|
|
|
58
|
+
@requires_dependencies(["openai"], extras="openai")
|
|
59
|
+
def get_models(self) -> Optional[list[str]]:
|
|
60
|
+
# In case the list model endpoint isn't exposed, don't break
|
|
61
|
+
from openai import APIStatusError
|
|
62
|
+
|
|
63
|
+
client = self.get_client()
|
|
64
|
+
try:
|
|
65
|
+
models = [m.id for m in list(client.models.list())]
|
|
66
|
+
return models
|
|
67
|
+
except APIStatusError as e:
|
|
68
|
+
if e.status_code == 404:
|
|
69
|
+
return None
|
|
70
|
+
except Exception as e:
|
|
71
|
+
raise self.wrap_error(e=e)
|
|
72
|
+
|
|
73
|
+
def run_precheck(self) -> None:
|
|
74
|
+
try:
|
|
75
|
+
models = self.get_models()
|
|
76
|
+
if models is None:
|
|
77
|
+
return
|
|
78
|
+
if self.embedder_model_name not in models:
|
|
79
|
+
raise UserError(
|
|
80
|
+
"model '{}' not found: {}".format(self.embedder_model_name, ", ".join(models))
|
|
81
|
+
)
|
|
82
|
+
except Exception as e:
|
|
83
|
+
raise self.wrap_error(e=e)
|
|
84
|
+
|
|
57
85
|
@requires_dependencies(["openai"], extras="openai")
|
|
58
86
|
def get_client(self) -> "OpenAI":
|
|
59
87
|
from openai import OpenAI
|
|
@@ -71,6 +99,9 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
|
|
|
71
99
|
class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
72
100
|
config: OpenAIEmbeddingConfig
|
|
73
101
|
|
|
102
|
+
def precheck(self):
|
|
103
|
+
self.config.run_precheck()
|
|
104
|
+
|
|
74
105
|
def wrap_error(self, e: Exception) -> Exception:
|
|
75
106
|
return self.config.wrap_error(e=e)
|
|
76
107
|
|
|
@@ -86,6 +117,9 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
86
117
|
class AsyncOpenAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
87
118
|
config: OpenAIEmbeddingConfig
|
|
88
119
|
|
|
120
|
+
def precheck(self):
|
|
121
|
+
self.config.run_precheck()
|
|
122
|
+
|
|
89
123
|
def wrap_error(self, e: Exception) -> Exception:
|
|
90
124
|
return self.config.wrap_error(e=e)
|
|
91
125
|
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/togetherai.py
RENAMED
|
@@ -41,6 +41,17 @@ class TogetherAIEmbeddingConfig(EmbeddingConfig):
|
|
|
41
41
|
return CustomRateLimitError(message)
|
|
42
42
|
return UserError(message)
|
|
43
43
|
|
|
44
|
+
def run_precheck(self) -> None:
|
|
45
|
+
client = self.get_client()
|
|
46
|
+
try:
|
|
47
|
+
models = [m.id for m in list(client.models.list())]
|
|
48
|
+
if self.embedder_model_name not in models:
|
|
49
|
+
raise UserError(
|
|
50
|
+
"model '{}' not found: {}".format(self.embedder_model_name, ", ".join(models))
|
|
51
|
+
)
|
|
52
|
+
except Exception as e:
|
|
53
|
+
raise self.wrap_error(e=e)
|
|
54
|
+
|
|
44
55
|
@requires_dependencies(["together"], extras="togetherai")
|
|
45
56
|
def get_client(self) -> "Together":
|
|
46
57
|
from together import Together
|
|
@@ -58,6 +69,9 @@ class TogetherAIEmbeddingConfig(EmbeddingConfig):
|
|
|
58
69
|
class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
59
70
|
config: TogetherAIEmbeddingConfig
|
|
60
71
|
|
|
72
|
+
def precheck(self):
|
|
73
|
+
self.config.run_precheck()
|
|
74
|
+
|
|
61
75
|
def wrap_error(self, e: Exception) -> Exception:
|
|
62
76
|
return self.config.wrap_error(e=e)
|
|
63
77
|
|
|
@@ -73,6 +87,9 @@ class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
73
87
|
class AsyncTogetherAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
|
|
74
88
|
config: TogetherAIEmbeddingConfig
|
|
75
89
|
|
|
90
|
+
def precheck(self):
|
|
91
|
+
self.config.run_precheck()
|
|
92
|
+
|
|
76
93
|
def wrap_error(self, e: Exception) -> Exception:
|
|
77
94
|
return self.config.wrap_error(e=e)
|
|
78
95
|
|
|
@@ -38,12 +38,9 @@ CONNECTOR_TYPE = "google_drive"
|
|
|
38
38
|
|
|
39
39
|
# Maps Google-native Drive MIME types → export MIME types
|
|
40
40
|
GOOGLE_EXPORT_MIME_MAP = {
|
|
41
|
-
"application/vnd.google-apps.document":
|
|
42
|
-
|
|
43
|
-
"application/vnd.google-apps.
|
|
44
|
-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
45
|
-
"application/vnd.google-apps.presentation": \
|
|
46
|
-
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
41
|
+
"application/vnd.google-apps.document": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", # noqa: E501
|
|
42
|
+
"application/vnd.google-apps.spreadsheet": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # noqa: E501
|
|
43
|
+
"application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation", # noqa: E501
|
|
47
44
|
}
|
|
48
45
|
|
|
49
46
|
# Maps export MIME types → file extensions
|
|
@@ -57,9 +54,9 @@ EXPORT_EXTENSION_MAP = {
|
|
|
57
54
|
|
|
58
55
|
|
|
59
56
|
class GoogleDriveAccessConfig(AccessConfig):
|
|
60
|
-
service_account_key: Optional[
|
|
61
|
-
|
|
62
|
-
|
|
57
|
+
service_account_key: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
|
|
58
|
+
default=None, description="Credentials values to use for authentication"
|
|
59
|
+
)
|
|
63
60
|
service_account_key_path: Optional[Path] = Field(
|
|
64
61
|
default=None,
|
|
65
62
|
description="File path to credentials values to use for authentication",
|
|
@@ -178,14 +175,10 @@ class GoogleDriveIndexer(Indexer):
|
|
|
178
175
|
Please enable it in the Google Cloud Console."
|
|
179
176
|
)
|
|
180
177
|
else:
|
|
181
|
-
raise SourceConnectionError(
|
|
182
|
-
"Google drive API unreachable for an unknown reason!"
|
|
183
|
-
)
|
|
178
|
+
raise SourceConnectionError("Google drive API unreachable for an unknown reason!")
|
|
184
179
|
|
|
185
180
|
@staticmethod
|
|
186
|
-
def count_files_recursively(
|
|
187
|
-
files_client, folder_id: str, extensions: list[str] = None
|
|
188
|
-
) -> int:
|
|
181
|
+
def count_files_recursively(files_client, folder_id: str, extensions: list[str] = None) -> int:
|
|
189
182
|
"""
|
|
190
183
|
Count non-folder files recursively under the given folder.
|
|
191
184
|
If `extensions` is provided, only count files
|
|
@@ -266,9 +259,7 @@ class GoogleDriveIndexer(Indexer):
|
|
|
266
259
|
# that the service account has proper permissions."
|
|
267
260
|
# )
|
|
268
261
|
else:
|
|
269
|
-
logger.info(
|
|
270
|
-
f"Found {file_count} files recursively in the folder."
|
|
271
|
-
)
|
|
262
|
+
logger.info(f"Found {file_count} files recursively in the folder.")
|
|
272
263
|
else:
|
|
273
264
|
# Non-recursive: check for at least one immediate non-folder child.
|
|
274
265
|
response = client.list(
|
|
@@ -319,9 +310,7 @@ class GoogleDriveIndexer(Indexer):
|
|
|
319
310
|
date_modified_str = root_info.pop("modifiedTime", None)
|
|
320
311
|
parent_path = root_info.pop("parent_path", None)
|
|
321
312
|
parent_root_path = root_info.pop("parent_root_path", None)
|
|
322
|
-
date_modified_dt = (
|
|
323
|
-
parser.parse(date_modified_str) if date_modified_str else None
|
|
324
|
-
)
|
|
313
|
+
date_modified_dt = parser.parse(date_modified_str) if date_modified_str else None
|
|
325
314
|
if (
|
|
326
315
|
parent_path
|
|
327
316
|
and isinstance(parent_path, str)
|
|
@@ -421,9 +410,7 @@ class GoogleDriveIndexer(Indexer):
|
|
|
421
410
|
) -> list[FileData]:
|
|
422
411
|
root_info = self.get_root_info(files_client=files_client, object_id=object_id)
|
|
423
412
|
if not self.is_dir(root_info):
|
|
424
|
-
root_info["permissions"] = self.extract_permissions(
|
|
425
|
-
root_info.get("permissions")
|
|
426
|
-
)
|
|
413
|
+
root_info["permissions"] = self.extract_permissions(root_info.get("permissions"))
|
|
427
414
|
data = [self.map_file_data(root_info)]
|
|
428
415
|
else:
|
|
429
416
|
file_contents = self.get_paginated_results(
|
|
@@ -514,9 +501,7 @@ class GoogleDriveDownloader(Downloader):
|
|
|
514
501
|
)
|
|
515
502
|
connector_type: str = CONNECTOR_TYPE
|
|
516
503
|
|
|
517
|
-
def _get_download_url_and_ext(
|
|
518
|
-
self, file_id: str, mime_type: str
|
|
519
|
-
) -> tuple[str, str]:
|
|
504
|
+
def _get_download_url_and_ext(self, file_id: str, mime_type: str) -> tuple[str, str]:
|
|
520
505
|
"""
|
|
521
506
|
Resolves the appropriate download URL and expected file extension for a Google Drive file.
|
|
522
507
|
|
|
@@ -530,9 +515,7 @@ class GoogleDriveDownloader(Downloader):
|
|
|
530
515
|
SourceConnectionError: If no valid export or download link is available.
|
|
531
516
|
"""
|
|
532
517
|
with self.connection_config.get_client() as client:
|
|
533
|
-
metadata = client.get(
|
|
534
|
-
fileId=file_id, fields="exportLinks,webContentLink"
|
|
535
|
-
).execute()
|
|
518
|
+
metadata = client.get(fileId=file_id, fields="exportLinks,webContentLink").execute()
|
|
536
519
|
|
|
537
520
|
export_links = metadata.get("exportLinks", {})
|
|
538
521
|
web_link = metadata.get("webContentLink")
|
|
@@ -540,16 +523,12 @@ class GoogleDriveDownloader(Downloader):
|
|
|
540
523
|
if export_mime := GOOGLE_EXPORT_MIME_MAP.get(mime_type):
|
|
541
524
|
url = export_links.get(export_mime)
|
|
542
525
|
if not url:
|
|
543
|
-
raise SourceConnectionError(
|
|
544
|
-
f"No export link found for {file_id} as {export_mime}"
|
|
545
|
-
)
|
|
526
|
+
raise SourceConnectionError(f"No export link found for {file_id} as {export_mime}")
|
|
546
527
|
ext = EXPORT_EXTENSION_MAP.get(export_mime, "")
|
|
547
528
|
return url, ext
|
|
548
529
|
|
|
549
530
|
if not web_link:
|
|
550
|
-
raise SourceConnectionError(
|
|
551
|
-
f"No webContentLink available for file {file_id}"
|
|
552
|
-
)
|
|
531
|
+
raise SourceConnectionError(f"No webContentLink available for file {file_id}")
|
|
553
532
|
return web_link, ""
|
|
554
533
|
|
|
555
534
|
@requires_dependencies(["httpx", "google.auth"], extras="google-drive")
|
|
@@ -622,9 +601,7 @@ class GoogleDriveDownloader(Downloader):
|
|
|
622
601
|
)
|
|
623
602
|
file_data.local_download_path = str(download_path.resolve())
|
|
624
603
|
|
|
625
|
-
return self.generate_download_response(
|
|
626
|
-
file_data=file_data, download_path=download_path
|
|
627
|
-
)
|
|
604
|
+
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
628
605
|
|
|
629
606
|
|
|
630
607
|
google_drive_source_entry = SourceRegistryEntry(
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/embedder.py
RENAMED
|
@@ -189,6 +189,10 @@ class Embedder(BaseProcess, ABC):
|
|
|
189
189
|
def init(self, **kwargs: Any) -> None:
|
|
190
190
|
self.config.get_embedder().initialize()
|
|
191
191
|
|
|
192
|
+
def precheck(self) -> None:
|
|
193
|
+
embedder = self.config.get_embedder()
|
|
194
|
+
embedder.precheck()
|
|
195
|
+
|
|
192
196
|
def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
|
|
193
197
|
# TODO update base embedder classes to support async
|
|
194
198
|
embedder = self.config.get_embedder()
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.0.13" # pragma: no cover
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/base/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/base/cmd.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/base/dest.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/base/importer.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/base/src.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/utils/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/cli/utils/click.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/data_types/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/data_types/entities.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/azure_openai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/embed/voyageai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/interfaces/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/interfaces/indexer.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/interfaces/process.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/interfaces/uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/interfaces.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/otel.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/pipeline/pipeline.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.13 → unstructured_ingest-1.0.14}/unstructured_ingest/processes/chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|