unstructured-ingest 1.0.8__tar.gz → 1.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/PKG-INFO +1 -1
- unstructured_ingest-1.0.12/unstructured_ingest/__version__.py +1 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/google_drive.py +171 -65
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +36 -6
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/pinecone.py +26 -0
- unstructured_ingest-1.0.8/unstructured_ingest/__version__.py +0 -1
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/.gitignore +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/LICENSE.md +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/README.md +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/pyproject.toml +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/README.md +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/cmd.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/dest.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/importer.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/src.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/cli.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/cmds.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/utils/click.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/data_types/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/data_types/entities.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/data_types/file_data.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/azure_openai.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/bedrock.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/huggingface.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/interfaces.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/mixedbreadai.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/octoai.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/openai.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/togetherai.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/vertexai.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/voyageai.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/error.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/errors_v2.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/connector.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/downloader.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/indexer.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/process.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/processor.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/upload_stager.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/uploader.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/logger.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/main.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/otel.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/interfaces.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/otel.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/pipeline.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/download.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/embed.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/filter.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/index.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/partition.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/stage.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/upload.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/chunker.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connector_registry.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/airtable.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/astradb.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/chroma.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/confluence.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/volumes.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/delta_table.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/discord.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/s3.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/github.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/gitlab.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/jira.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/local.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/milvus.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/mongodb.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/connector.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/onedrive.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/outlook.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/redisdb.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/salesforce.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sharepoint.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/slack.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/sql.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/utils.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/vectara.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/embedder.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/filter.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/partitioner.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/uncompress.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/unstructured_api.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/chunking.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/compression.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/constants.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/data_prep.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/dep_check.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/html.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/ndjson.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/pydantic_models.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
- {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/table.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.12" # pragma: no cover
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import io
|
|
2
1
|
import json
|
|
3
2
|
from contextlib import contextmanager
|
|
4
3
|
from dataclasses import dataclass, field
|
|
@@ -16,7 +15,6 @@ from unstructured_ingest.data_types.file_data import (
|
|
|
16
15
|
)
|
|
17
16
|
from unstructured_ingest.error import (
|
|
18
17
|
SourceConnectionError,
|
|
19
|
-
SourceConnectionNetworkError,
|
|
20
18
|
)
|
|
21
19
|
from unstructured_ingest.interfaces import (
|
|
22
20
|
AccessConfig,
|
|
@@ -34,25 +32,34 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
34
32
|
|
|
35
33
|
if TYPE_CHECKING:
|
|
36
34
|
from googleapiclient.discovery import Resource as GoogleAPIResource
|
|
37
|
-
from googleapiclient.http import MediaIoBaseDownload
|
|
38
35
|
|
|
39
36
|
CONNECTOR_TYPE = "google_drive"
|
|
40
37
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
"application/vnd.google-apps.
|
|
45
|
-
|
|
46
|
-
"application/vnd.google-apps.
|
|
47
|
-
|
|
48
|
-
"application/vnd.google-apps.
|
|
38
|
+
|
|
39
|
+
# Maps Google-native Drive MIME types → export MIME types
|
|
40
|
+
GOOGLE_EXPORT_MIME_MAP = {
|
|
41
|
+
"application/vnd.google-apps.document": \
|
|
42
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
43
|
+
"application/vnd.google-apps.spreadsheet": \
|
|
44
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
45
|
+
"application/vnd.google-apps.presentation": \
|
|
46
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
# Maps export MIME types → file extensions
|
|
50
|
+
EXPORT_EXTENSION_MAP = {
|
|
51
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
|
52
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
|
53
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
|
54
|
+
"application/pdf": ".pdf",
|
|
55
|
+
"text/html": ".html",
|
|
49
56
|
}
|
|
50
57
|
|
|
51
58
|
|
|
52
59
|
class GoogleDriveAccessConfig(AccessConfig):
|
|
53
|
-
service_account_key: Optional[
|
|
54
|
-
|
|
55
|
-
)
|
|
60
|
+
service_account_key: Optional[
|
|
61
|
+
Annotated[dict, BeforeValidator(conform_string_to_dict)]
|
|
62
|
+
] = Field(default=None, description="Credentials values to use for authentication")
|
|
56
63
|
service_account_key_path: Optional[Path] = Field(
|
|
57
64
|
default=None,
|
|
58
65
|
description="File path to credentials values to use for authentication",
|
|
@@ -153,7 +160,13 @@ class GoogleDriveIndexer(Indexer):
|
|
|
153
160
|
"""
|
|
154
161
|
try:
|
|
155
162
|
# A very minimal call: list 1 file from the drive.
|
|
156
|
-
client.list(
|
|
163
|
+
client.list(
|
|
164
|
+
supportsAllDrives=True,
|
|
165
|
+
includeItemsFromAllDrives=True,
|
|
166
|
+
spaces="drive",
|
|
167
|
+
pageSize=1,
|
|
168
|
+
fields="files(id)",
|
|
169
|
+
).execute()
|
|
157
170
|
except HttpError as e:
|
|
158
171
|
error_content = e.content.decode() if hasattr(e, "content") else ""
|
|
159
172
|
lower_error = error_content.lower()
|
|
@@ -165,10 +178,14 @@ class GoogleDriveIndexer(Indexer):
|
|
|
165
178
|
Please enable it in the Google Cloud Console."
|
|
166
179
|
)
|
|
167
180
|
else:
|
|
168
|
-
raise SourceConnectionError(
|
|
181
|
+
raise SourceConnectionError(
|
|
182
|
+
"Google drive API unreachable for an unknown reason!"
|
|
183
|
+
)
|
|
169
184
|
|
|
170
185
|
@staticmethod
|
|
171
|
-
def count_files_recursively(
|
|
186
|
+
def count_files_recursively(
|
|
187
|
+
files_client, folder_id: str, extensions: list[str] = None
|
|
188
|
+
) -> int:
|
|
172
189
|
"""
|
|
173
190
|
Count non-folder files recursively under the given folder.
|
|
174
191
|
If `extensions` is provided, only count files
|
|
@@ -183,6 +200,8 @@ class GoogleDriveIndexer(Indexer):
|
|
|
183
200
|
page_token = None
|
|
184
201
|
while True:
|
|
185
202
|
response = files_client.list(
|
|
203
|
+
supportsAllDrives=True,
|
|
204
|
+
includeItemsFromAllDrives=True,
|
|
186
205
|
spaces="drive",
|
|
187
206
|
q=query,
|
|
188
207
|
fields="nextPageToken, files(id, mimeType, fileExtension)",
|
|
@@ -247,10 +266,14 @@ class GoogleDriveIndexer(Indexer):
|
|
|
247
266
|
# that the service account has proper permissions."
|
|
248
267
|
# )
|
|
249
268
|
else:
|
|
250
|
-
logger.info(
|
|
269
|
+
logger.info(
|
|
270
|
+
f"Found {file_count} files recursively in the folder."
|
|
271
|
+
)
|
|
251
272
|
else:
|
|
252
273
|
# Non-recursive: check for at least one immediate non-folder child.
|
|
253
274
|
response = client.list(
|
|
275
|
+
supportsAllDrives=True,
|
|
276
|
+
includeItemsFromAllDrives=True,
|
|
254
277
|
spaces="drive",
|
|
255
278
|
fields="files(id)",
|
|
256
279
|
pageSize=1,
|
|
@@ -296,7 +319,9 @@ class GoogleDriveIndexer(Indexer):
|
|
|
296
319
|
date_modified_str = root_info.pop("modifiedTime", None)
|
|
297
320
|
parent_path = root_info.pop("parent_path", None)
|
|
298
321
|
parent_root_path = root_info.pop("parent_root_path", None)
|
|
299
|
-
date_modified_dt =
|
|
322
|
+
date_modified_dt = (
|
|
323
|
+
parser.parse(date_modified_str) if date_modified_str else None
|
|
324
|
+
)
|
|
300
325
|
if (
|
|
301
326
|
parent_path
|
|
302
327
|
and isinstance(parent_path, str)
|
|
@@ -348,6 +373,8 @@ class GoogleDriveIndexer(Indexer):
|
|
|
348
373
|
files_response = []
|
|
349
374
|
while not done:
|
|
350
375
|
response: dict = files_client.list(
|
|
376
|
+
supportsAllDrives=True,
|
|
377
|
+
includeItemsFromAllDrives=True,
|
|
351
378
|
spaces="drive",
|
|
352
379
|
fields=fields_input,
|
|
353
380
|
corpora="user",
|
|
@@ -381,7 +408,9 @@ class GoogleDriveIndexer(Indexer):
|
|
|
381
408
|
return files_response
|
|
382
409
|
|
|
383
410
|
def get_root_info(self, files_client, object_id: str) -> dict:
|
|
384
|
-
return files_client.get(
|
|
411
|
+
return files_client.get(
|
|
412
|
+
supportsAllDrives=True, fileId=object_id, fields=",".join(self.fields)
|
|
413
|
+
).execute()
|
|
385
414
|
|
|
386
415
|
def get_files(
|
|
387
416
|
self,
|
|
@@ -392,7 +421,9 @@ class GoogleDriveIndexer(Indexer):
|
|
|
392
421
|
) -> list[FileData]:
|
|
393
422
|
root_info = self.get_root_info(files_client=files_client, object_id=object_id)
|
|
394
423
|
if not self.is_dir(root_info):
|
|
395
|
-
root_info["permissions"] = self.extract_permissions(
|
|
424
|
+
root_info["permissions"] = self.extract_permissions(
|
|
425
|
+
root_info.get("permissions")
|
|
426
|
+
)
|
|
396
427
|
data = [self.map_file_data(root_info)]
|
|
397
428
|
else:
|
|
398
429
|
file_contents = self.get_paginated_results(
|
|
@@ -413,7 +444,7 @@ class GoogleDriveIndexer(Indexer):
|
|
|
413
444
|
def extract_permissions(self, permissions: Optional[list[dict]]) -> list[dict]:
|
|
414
445
|
if not permissions:
|
|
415
446
|
logger.debug("no permissions found")
|
|
416
|
-
return {}
|
|
447
|
+
return [{}]
|
|
417
448
|
|
|
418
449
|
# https://developers.google.com/workspace/drive/api/guides/ref-roles
|
|
419
450
|
role_mapping = {
|
|
@@ -464,61 +495,136 @@ class GoogleDriveDownloaderConfig(DownloaderConfig):
|
|
|
464
495
|
|
|
465
496
|
@dataclass
|
|
466
497
|
class GoogleDriveDownloader(Downloader):
|
|
498
|
+
"""
|
|
499
|
+
Downloads files from Google Drive using authenticated direct HTTP requests
|
|
500
|
+
via `exportLinks` (for Google-native files) and `webContentLink` (for binary files).
|
|
501
|
+
|
|
502
|
+
These links emulate the behavior of Google Drive's "File > Download as..." options
|
|
503
|
+
in the UI and bypass the size limitations of `files.export()`.
|
|
504
|
+
|
|
505
|
+
Behavior:
|
|
506
|
+
- Google-native formats are downloaded using `exportLinks` in appropriate MIME formats.
|
|
507
|
+
- Binary files (non-Google-native) are downloaded using `webContentLink`.
|
|
508
|
+
- All downloads are performed via `requests.get()` using a valid bearer token.
|
|
509
|
+
"""
|
|
510
|
+
|
|
467
511
|
connection_config: GoogleDriveConnectionConfig
|
|
468
512
|
download_config: GoogleDriveDownloaderConfig = field(
|
|
469
513
|
default_factory=lambda: GoogleDriveDownloaderConfig()
|
|
470
514
|
)
|
|
471
515
|
connector_type: str = CONNECTOR_TYPE
|
|
472
516
|
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
return downloaded
|
|
517
|
+
def _get_download_url_and_ext(
|
|
518
|
+
self, file_id: str, mime_type: str
|
|
519
|
+
) -> tuple[str, str]:
|
|
520
|
+
"""
|
|
521
|
+
Resolves the appropriate download URL and expected file extension for a Google Drive file.
|
|
479
522
|
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
483
|
-
logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")
|
|
484
|
-
with open(download_path, "wb") as handler:
|
|
485
|
-
handler.write(file_contents.getbuffer())
|
|
486
|
-
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
523
|
+
- Google-native files use export MIME types from exportLinks (e.g., .docx, .xlsx).
|
|
524
|
+
- Binary files use webContentLink (e.g., uploaded PDFs or ZIPs).
|
|
487
525
|
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
from googleapiclient.http import MediaIoBaseDownload
|
|
526
|
+
Returns:
|
|
527
|
+
Tuple[str, str]: (download URL, file extension or "")
|
|
491
528
|
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
if not mime_type:
|
|
496
|
-
raise TypeError(
|
|
497
|
-
f"File not supported. Name: {file_data.source_identifiers.filename} "
|
|
498
|
-
f"ID: {record_id} "
|
|
499
|
-
f"MimeType: {mime_type}"
|
|
500
|
-
)
|
|
529
|
+
Raises:
|
|
530
|
+
SourceConnectionError: If no valid export or download link is available.
|
|
531
|
+
"""
|
|
501
532
|
with self.connection_config.get_client() as client:
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
533
|
+
metadata = client.get(
|
|
534
|
+
fileId=file_id, fields="exportLinks,webContentLink"
|
|
535
|
+
).execute()
|
|
536
|
+
|
|
537
|
+
export_links = metadata.get("exportLinks", {})
|
|
538
|
+
web_link = metadata.get("webContentLink")
|
|
539
|
+
|
|
540
|
+
if export_mime := GOOGLE_EXPORT_MIME_MAP.get(mime_type):
|
|
541
|
+
url = export_links.get(export_mime)
|
|
542
|
+
if not url:
|
|
543
|
+
raise SourceConnectionError(
|
|
544
|
+
f"No export link found for {file_id} as {export_mime}"
|
|
508
545
|
)
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
546
|
+
ext = EXPORT_EXTENSION_MAP.get(export_mime, "")
|
|
547
|
+
return url, ext
|
|
548
|
+
|
|
549
|
+
if not web_link:
|
|
550
|
+
raise SourceConnectionError(
|
|
551
|
+
f"No webContentLink available for file {file_id}"
|
|
552
|
+
)
|
|
553
|
+
return web_link, ""
|
|
554
|
+
|
|
555
|
+
@requires_dependencies(["httpx", "google.auth"], extras="google-drive")
|
|
556
|
+
def _download_url(self, file_data: FileData, url: str, ext: str = "") -> Path:
|
|
557
|
+
"""
|
|
558
|
+
Streams file content directly to disk using authenticated HTTP request.
|
|
559
|
+
|
|
560
|
+
Writes the file to the correct path in the download directory while downloading.
|
|
561
|
+
Avoids buffering large files in memory.
|
|
562
|
+
|
|
563
|
+
Returns:
|
|
564
|
+
Path to the downloaded file.
|
|
565
|
+
|
|
566
|
+
Raises:
|
|
567
|
+
SourceConnectionError: If the HTTP request fails.
|
|
568
|
+
"""
|
|
569
|
+
import httpx
|
|
570
|
+
from google.auth.transport.requests import Request
|
|
571
|
+
from google.oauth2 import service_account
|
|
572
|
+
|
|
573
|
+
access_config = self.connection_config.access_config.get_secret_value()
|
|
574
|
+
key_data = access_config.get_service_account_key()
|
|
575
|
+
creds = service_account.Credentials.from_service_account_info(
|
|
576
|
+
key_data,
|
|
577
|
+
scopes=["https://www.googleapis.com/auth/drive.readonly"],
|
|
578
|
+
)
|
|
579
|
+
creds.refresh(Request())
|
|
580
|
+
|
|
581
|
+
headers = {
|
|
582
|
+
"Authorization": f"Bearer {creds.token}",
|
|
583
|
+
}
|
|
584
|
+
|
|
585
|
+
download_path = self.get_download_path(file_data)
|
|
586
|
+
if ext:
|
|
587
|
+
download_path = download_path.with_suffix(ext)
|
|
588
|
+
|
|
589
|
+
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
590
|
+
logger.debug(f"Streaming file to {download_path}")
|
|
591
|
+
|
|
592
|
+
with (
|
|
593
|
+
httpx.Client(timeout=None, follow_redirects=True) as client,
|
|
594
|
+
client.stream("GET", url, headers=headers) as response,
|
|
595
|
+
):
|
|
596
|
+
if response.status_code != 200:
|
|
597
|
+
raise SourceConnectionError(
|
|
598
|
+
f"Failed to stream download from {url}: {response.status_code}"
|
|
512
599
|
)
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
600
|
+
with open(download_path, "wb") as f:
|
|
601
|
+
for chunk in response.iter_bytes():
|
|
602
|
+
f.write(chunk)
|
|
603
|
+
|
|
604
|
+
return download_path
|
|
605
|
+
|
|
606
|
+
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
607
|
+
mime_type = file_data.additional_metadata.get("mimeType", "")
|
|
608
|
+
record_id = file_data.identifier
|
|
609
|
+
|
|
610
|
+
logger.debug(
|
|
611
|
+
f"Downloading file {file_data.source_identifiers.fullpath} of type {mime_type}"
|
|
612
|
+
)
|
|
613
|
+
|
|
614
|
+
download_url, ext = self._get_download_url_and_ext(record_id, mime_type)
|
|
615
|
+
download_path = self._download_url(file_data, download_url, ext)
|
|
616
|
+
|
|
617
|
+
file_data.additional_metadata.update(
|
|
618
|
+
{
|
|
619
|
+
"download_method": "export_link" if ext else "web_content_link",
|
|
620
|
+
"download_url_used": download_url,
|
|
621
|
+
}
|
|
622
|
+
)
|
|
623
|
+
file_data.local_download_path = str(download_path.resolve())
|
|
624
|
+
|
|
625
|
+
return self.generate_download_response(
|
|
626
|
+
file_data=file_data, download_path=download_path
|
|
627
|
+
)
|
|
522
628
|
|
|
523
629
|
|
|
524
630
|
google_drive_source_entry = SourceRegistryEntry(
|
|
@@ -69,8 +69,8 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
|
|
|
69
69
|
|
|
70
70
|
@property
|
|
71
71
|
def bearer_token(self) -> str:
|
|
72
|
-
# Add
|
|
73
|
-
timestamp = int(time.time()) + 60
|
|
72
|
+
# Add 5 minutes to deal with edge cases where the token expires before the request is made
|
|
73
|
+
timestamp = int(time.time()) + (60 * 5)
|
|
74
74
|
if self._bearer_token is None or self._bearer_token.get("expiration", 0) <= timestamp:
|
|
75
75
|
self._bearer_token = self.generate_bearer_token()
|
|
76
76
|
return self._bearer_token["access_token"]
|
|
@@ -172,7 +172,7 @@ class IbmWatsonxUploaderConfig(UploaderConfig):
|
|
|
172
172
|
namespace: str = Field(description="Namespace name")
|
|
173
173
|
table: str = Field(description="Table name")
|
|
174
174
|
max_retries: int = Field(
|
|
175
|
-
default=5, description="Maximum number of retries to upload data", ge=2, le=
|
|
175
|
+
default=5, description="Maximum number of retries to upload data", ge=2, le=500
|
|
176
176
|
)
|
|
177
177
|
record_id_key: str = Field(
|
|
178
178
|
default=RECORD_ID_LABEL,
|
|
@@ -240,7 +240,7 @@ class IbmWatsonxUploader(SQLUploader):
|
|
|
240
240
|
def upload_data_table(
|
|
241
241
|
self, table: "Table", data_table: "ArrowTable", file_data: FileData
|
|
242
242
|
) -> None:
|
|
243
|
-
from pyiceberg.exceptions import CommitFailedException
|
|
243
|
+
from pyiceberg.exceptions import CommitFailedException, RESTError
|
|
244
244
|
from tenacity import (
|
|
245
245
|
before_log,
|
|
246
246
|
retry,
|
|
@@ -265,21 +265,51 @@ class IbmWatsonxUploader(SQLUploader):
|
|
|
265
265
|
table.refresh()
|
|
266
266
|
logger.debug(e)
|
|
267
267
|
raise IcebergCommitFailedException(e)
|
|
268
|
+
except RESTError:
|
|
269
|
+
raise
|
|
268
270
|
except Exception as e:
|
|
269
271
|
raise ProviderError(f"Failed to upload data to table: {e}")
|
|
270
272
|
|
|
271
273
|
try:
|
|
272
274
|
return _upload_data_table(table, data_table, file_data)
|
|
275
|
+
except RESTError:
|
|
276
|
+
raise
|
|
273
277
|
except ProviderError:
|
|
274
278
|
raise
|
|
275
279
|
except Exception as e:
|
|
276
280
|
raise ProviderError(f"Failed to upload data to table: {e}")
|
|
277
281
|
|
|
282
|
+
@requires_dependencies(["pyiceberg", "tenacity"], extras="ibm-watsonx-s3")
|
|
278
283
|
def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
|
|
284
|
+
from pyiceberg.exceptions import RESTError
|
|
285
|
+
from tenacity import (
|
|
286
|
+
before_log,
|
|
287
|
+
retry,
|
|
288
|
+
retry_if_exception_type,
|
|
289
|
+
stop_after_attempt,
|
|
290
|
+
wait_random,
|
|
291
|
+
)
|
|
292
|
+
|
|
279
293
|
data_table = self._df_to_arrow_table(df)
|
|
280
294
|
|
|
281
|
-
|
|
282
|
-
|
|
295
|
+
# Retry connection in case of connection error
|
|
296
|
+
@retry(
|
|
297
|
+
stop=stop_after_attempt(2),
|
|
298
|
+
wait=wait_random(),
|
|
299
|
+
retry=retry_if_exception_type(RESTError),
|
|
300
|
+
before=before_log(logger, logging.DEBUG),
|
|
301
|
+
reraise=True,
|
|
302
|
+
)
|
|
303
|
+
def _upload_dataframe(data_table: Any, file_data: FileData) -> None:
|
|
304
|
+
with self.get_table() as table:
|
|
305
|
+
self.upload_data_table(table, data_table, file_data)
|
|
306
|
+
|
|
307
|
+
try:
|
|
308
|
+
return _upload_dataframe(data_table, file_data)
|
|
309
|
+
except ProviderError:
|
|
310
|
+
raise
|
|
311
|
+
except Exception as e:
|
|
312
|
+
raise ProviderError(f"Failed to upload data to table: {e}")
|
|
283
313
|
|
|
284
314
|
@requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
|
|
285
315
|
def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import re
|
|
3
3
|
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
4
5
|
from typing import TYPE_CHECKING, Any, Literal, Optional
|
|
5
6
|
|
|
6
7
|
from pydantic import Field, Secret
|
|
@@ -18,11 +19,14 @@ from unstructured_ingest.interfaces import (
|
|
|
18
19
|
)
|
|
19
20
|
from unstructured_ingest.logger import logger
|
|
20
21
|
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
22
|
+
from unstructured_ingest.utils import ndjson
|
|
21
23
|
from unstructured_ingest.utils.constants import RECORD_ID_LABEL
|
|
22
24
|
from unstructured_ingest.utils.data_prep import (
|
|
23
25
|
flatten_dict,
|
|
24
26
|
generator_batching_wbytes,
|
|
25
27
|
get_enhanced_element_id,
|
|
28
|
+
get_json_data,
|
|
29
|
+
write_data,
|
|
26
30
|
)
|
|
27
31
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
28
32
|
|
|
@@ -162,6 +166,28 @@ class PineconeUploadStager(UploadStager):
|
|
|
162
166
|
"metadata": metadata,
|
|
163
167
|
}
|
|
164
168
|
|
|
169
|
+
def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
|
|
170
|
+
with input_file.open() as in_f:
|
|
171
|
+
reader = ndjson.reader(in_f)
|
|
172
|
+
with output_file.open("w") as out_f:
|
|
173
|
+
writer = ndjson.writer(out_f)
|
|
174
|
+
for element in reader:
|
|
175
|
+
if "embeddings" not in element:
|
|
176
|
+
continue
|
|
177
|
+
conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
|
|
178
|
+
writer.write(row=conformed_element)
|
|
179
|
+
writer.f.flush()
|
|
180
|
+
|
|
181
|
+
def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
|
|
182
|
+
elements_contents = get_json_data(path=input_file)
|
|
183
|
+
|
|
184
|
+
conformed_elements = [
|
|
185
|
+
self.conform_dict(element_dict=element, file_data=file_data)
|
|
186
|
+
for element in elements_contents
|
|
187
|
+
if "embeddings" in element
|
|
188
|
+
]
|
|
189
|
+
write_data(path=output_file, data=conformed_elements)
|
|
190
|
+
|
|
165
191
|
|
|
166
192
|
@dataclass
|
|
167
193
|
class PineconeUploader(VectorDBUploader):
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.0.8" # pragma: no cover
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/cmd.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/dest.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/importer.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/src.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/utils/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/utils/click.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/data_types/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/data_types/entities.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/data_types/file_data.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/azure_openai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/bedrock.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/huggingface.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/interfaces.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/mixedbreadai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/octoai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/openai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/togetherai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/vertexai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/voyageai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/connector.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/indexer.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/process.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/processor.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/interfaces.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/otel.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/pipeline.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/chunk.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/embed.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/stage.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|