unstructured-ingest 1.0.11__tar.gz → 1.0.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/PKG-INFO +1 -1
- unstructured_ingest-1.0.13/unstructured_ingest/__version__.py +1 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/google_drive.py +155 -63
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +65 -7
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +1 -0
- unstructured_ingest-1.0.11/unstructured_ingest/__version__.py +0 -1
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/.gitignore +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/LICENSE.md +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/README.md +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/pyproject.toml +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/README.md +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/base/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/base/cmd.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/base/dest.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/base/importer.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/base/src.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/cli.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/cmds.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/utils/click.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/data_types/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/data_types/entities.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/data_types/file_data.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/azure_openai.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/bedrock.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/huggingface.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/interfaces.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/mixedbreadai.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/octoai.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/openai.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/togetherai.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/vertexai.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/voyageai.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/error.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/errors_v2.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/interfaces/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/interfaces/connector.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/interfaces/downloader.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/interfaces/indexer.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/interfaces/process.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/interfaces/processor.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/interfaces/upload_stager.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/interfaces/uploader.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/logger.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/main.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/otel.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/interfaces.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/otel.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/pipeline.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/steps/download.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/steps/embed.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/steps/filter.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/steps/index.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/steps/partition.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/steps/stage.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/steps/upload.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/chunker.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connector_registry.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/airtable.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/astradb.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/chroma.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/confluence.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/databricks/volumes.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/delta_table.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/discord.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/fsspec/s3.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/github.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/gitlab.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/jira.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/local.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/milvus.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/mongodb.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/connector.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/onedrive.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/outlook.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/pinecone.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/redisdb.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/salesforce.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/sharepoint.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/slack.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/sql/sql.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/utils.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/vectara.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/embedder.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/filter.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/partitioner.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/uncompress.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/unstructured_api.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/utils/chunking.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/utils/compression.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/utils/constants.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/utils/data_prep.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/utils/dep_check.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/utils/html.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/utils/ndjson.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/utils/pydantic_models.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
- {unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/utils/table.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.13" # pragma: no cover
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import io
|
|
2
1
|
import json
|
|
3
2
|
from contextlib import contextmanager
|
|
4
3
|
from dataclasses import dataclass, field
|
|
@@ -16,7 +15,6 @@ from unstructured_ingest.data_types.file_data import (
|
|
|
16
15
|
)
|
|
17
16
|
from unstructured_ingest.error import (
|
|
18
17
|
SourceConnectionError,
|
|
19
|
-
SourceConnectionNetworkError,
|
|
20
18
|
)
|
|
21
19
|
from unstructured_ingest.interfaces import (
|
|
22
20
|
AccessConfig,
|
|
@@ -34,25 +32,34 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
34
32
|
|
|
35
33
|
if TYPE_CHECKING:
|
|
36
34
|
from googleapiclient.discovery import Resource as GoogleAPIResource
|
|
37
|
-
from googleapiclient.http import MediaIoBaseDownload
|
|
38
35
|
|
|
39
36
|
CONNECTOR_TYPE = "google_drive"
|
|
40
37
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
"application/vnd.google-apps.
|
|
45
|
-
|
|
46
|
-
"application/vnd.google-apps.
|
|
47
|
-
|
|
48
|
-
"application/vnd.google-apps.
|
|
38
|
+
|
|
39
|
+
# Maps Google-native Drive MIME types → export MIME types
|
|
40
|
+
GOOGLE_EXPORT_MIME_MAP = {
|
|
41
|
+
"application/vnd.google-apps.document": \
|
|
42
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
43
|
+
"application/vnd.google-apps.spreadsheet": \
|
|
44
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
45
|
+
"application/vnd.google-apps.presentation": \
|
|
46
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
# Maps export MIME types → file extensions
|
|
50
|
+
EXPORT_EXTENSION_MAP = {
|
|
51
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
|
52
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
|
53
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
|
54
|
+
"application/pdf": ".pdf",
|
|
55
|
+
"text/html": ".html",
|
|
49
56
|
}
|
|
50
57
|
|
|
51
58
|
|
|
52
59
|
class GoogleDriveAccessConfig(AccessConfig):
|
|
53
|
-
service_account_key: Optional[
|
|
54
|
-
|
|
55
|
-
)
|
|
60
|
+
service_account_key: Optional[
|
|
61
|
+
Annotated[dict, BeforeValidator(conform_string_to_dict)]
|
|
62
|
+
] = Field(default=None, description="Credentials values to use for authentication")
|
|
56
63
|
service_account_key_path: Optional[Path] = Field(
|
|
57
64
|
default=None,
|
|
58
65
|
description="File path to credentials values to use for authentication",
|
|
@@ -171,10 +178,14 @@ class GoogleDriveIndexer(Indexer):
|
|
|
171
178
|
Please enable it in the Google Cloud Console."
|
|
172
179
|
)
|
|
173
180
|
else:
|
|
174
|
-
raise SourceConnectionError(
|
|
181
|
+
raise SourceConnectionError(
|
|
182
|
+
"Google drive API unreachable for an unknown reason!"
|
|
183
|
+
)
|
|
175
184
|
|
|
176
185
|
@staticmethod
|
|
177
|
-
def count_files_recursively(
|
|
186
|
+
def count_files_recursively(
|
|
187
|
+
files_client, folder_id: str, extensions: list[str] = None
|
|
188
|
+
) -> int:
|
|
178
189
|
"""
|
|
179
190
|
Count non-folder files recursively under the given folder.
|
|
180
191
|
If `extensions` is provided, only count files
|
|
@@ -255,7 +266,9 @@ class GoogleDriveIndexer(Indexer):
|
|
|
255
266
|
# that the service account has proper permissions."
|
|
256
267
|
# )
|
|
257
268
|
else:
|
|
258
|
-
logger.info(
|
|
269
|
+
logger.info(
|
|
270
|
+
f"Found {file_count} files recursively in the folder."
|
|
271
|
+
)
|
|
259
272
|
else:
|
|
260
273
|
# Non-recursive: check for at least one immediate non-folder child.
|
|
261
274
|
response = client.list(
|
|
@@ -306,7 +319,9 @@ class GoogleDriveIndexer(Indexer):
|
|
|
306
319
|
date_modified_str = root_info.pop("modifiedTime", None)
|
|
307
320
|
parent_path = root_info.pop("parent_path", None)
|
|
308
321
|
parent_root_path = root_info.pop("parent_root_path", None)
|
|
309
|
-
date_modified_dt =
|
|
322
|
+
date_modified_dt = (
|
|
323
|
+
parser.parse(date_modified_str) if date_modified_str else None
|
|
324
|
+
)
|
|
310
325
|
if (
|
|
311
326
|
parent_path
|
|
312
327
|
and isinstance(parent_path, str)
|
|
@@ -406,7 +421,9 @@ class GoogleDriveIndexer(Indexer):
|
|
|
406
421
|
) -> list[FileData]:
|
|
407
422
|
root_info = self.get_root_info(files_client=files_client, object_id=object_id)
|
|
408
423
|
if not self.is_dir(root_info):
|
|
409
|
-
root_info["permissions"] = self.extract_permissions(
|
|
424
|
+
root_info["permissions"] = self.extract_permissions(
|
|
425
|
+
root_info.get("permissions")
|
|
426
|
+
)
|
|
410
427
|
data = [self.map_file_data(root_info)]
|
|
411
428
|
else:
|
|
412
429
|
file_contents = self.get_paginated_results(
|
|
@@ -427,7 +444,7 @@ class GoogleDriveIndexer(Indexer):
|
|
|
427
444
|
def extract_permissions(self, permissions: Optional[list[dict]]) -> list[dict]:
|
|
428
445
|
if not permissions:
|
|
429
446
|
logger.debug("no permissions found")
|
|
430
|
-
return {}
|
|
447
|
+
return [{}]
|
|
431
448
|
|
|
432
449
|
# https://developers.google.com/workspace/drive/api/guides/ref-roles
|
|
433
450
|
role_mapping = {
|
|
@@ -478,61 +495,136 @@ class GoogleDriveDownloaderConfig(DownloaderConfig):
|
|
|
478
495
|
|
|
479
496
|
@dataclass
|
|
480
497
|
class GoogleDriveDownloader(Downloader):
|
|
498
|
+
"""
|
|
499
|
+
Downloads files from Google Drive using authenticated direct HTTP requests
|
|
500
|
+
via `exportLinks` (for Google-native files) and `webContentLink` (for binary files).
|
|
501
|
+
|
|
502
|
+
These links emulate the behavior of Google Drive's "File > Download as..." options
|
|
503
|
+
in the UI and bypass the size limitations of `files.export()`.
|
|
504
|
+
|
|
505
|
+
Behavior:
|
|
506
|
+
- Google-native formats are downloaded using `exportLinks` in appropriate MIME formats.
|
|
507
|
+
- Binary files (non-Google-native) are downloaded using `webContentLink`.
|
|
508
|
+
- All downloads are performed via `requests.get()` using a valid bearer token.
|
|
509
|
+
"""
|
|
510
|
+
|
|
481
511
|
connection_config: GoogleDriveConnectionConfig
|
|
482
512
|
download_config: GoogleDriveDownloaderConfig = field(
|
|
483
513
|
default_factory=lambda: GoogleDriveDownloaderConfig()
|
|
484
514
|
)
|
|
485
515
|
connector_type: str = CONNECTOR_TYPE
|
|
486
516
|
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
return downloaded
|
|
517
|
+
def _get_download_url_and_ext(
|
|
518
|
+
self, file_id: str, mime_type: str
|
|
519
|
+
) -> tuple[str, str]:
|
|
520
|
+
"""
|
|
521
|
+
Resolves the appropriate download URL and expected file extension for a Google Drive file.
|
|
493
522
|
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
497
|
-
logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")
|
|
498
|
-
with open(download_path, "wb") as handler:
|
|
499
|
-
handler.write(file_contents.getbuffer())
|
|
500
|
-
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
523
|
+
- Google-native files use export MIME types from exportLinks (e.g., .docx, .xlsx).
|
|
524
|
+
- Binary files use webContentLink (e.g., uploaded PDFs or ZIPs).
|
|
501
525
|
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
from googleapiclient.http import MediaIoBaseDownload
|
|
526
|
+
Returns:
|
|
527
|
+
Tuple[str, str]: (download URL, file extension or "")
|
|
505
528
|
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
if not mime_type:
|
|
510
|
-
raise TypeError(
|
|
511
|
-
f"File not supported. Name: {file_data.source_identifiers.filename} "
|
|
512
|
-
f"ID: {record_id} "
|
|
513
|
-
f"MimeType: {mime_type}"
|
|
514
|
-
)
|
|
529
|
+
Raises:
|
|
530
|
+
SourceConnectionError: If no valid export or download link is available.
|
|
531
|
+
"""
|
|
515
532
|
with self.connection_config.get_client() as client:
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
533
|
+
metadata = client.get(
|
|
534
|
+
fileId=file_id, fields="exportLinks,webContentLink"
|
|
535
|
+
).execute()
|
|
536
|
+
|
|
537
|
+
export_links = metadata.get("exportLinks", {})
|
|
538
|
+
web_link = metadata.get("webContentLink")
|
|
539
|
+
|
|
540
|
+
if export_mime := GOOGLE_EXPORT_MIME_MAP.get(mime_type):
|
|
541
|
+
url = export_links.get(export_mime)
|
|
542
|
+
if not url:
|
|
543
|
+
raise SourceConnectionError(
|
|
544
|
+
f"No export link found for {file_id} as {export_mime}"
|
|
522
545
|
)
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
546
|
+
ext = EXPORT_EXTENSION_MAP.get(export_mime, "")
|
|
547
|
+
return url, ext
|
|
548
|
+
|
|
549
|
+
if not web_link:
|
|
550
|
+
raise SourceConnectionError(
|
|
551
|
+
f"No webContentLink available for file {file_id}"
|
|
552
|
+
)
|
|
553
|
+
return web_link, ""
|
|
554
|
+
|
|
555
|
+
@requires_dependencies(["httpx", "google.auth"], extras="google-drive")
|
|
556
|
+
def _download_url(self, file_data: FileData, url: str, ext: str = "") -> Path:
|
|
557
|
+
"""
|
|
558
|
+
Streams file content directly to disk using authenticated HTTP request.
|
|
559
|
+
|
|
560
|
+
Writes the file to the correct path in the download directory while downloading.
|
|
561
|
+
Avoids buffering large files in memory.
|
|
562
|
+
|
|
563
|
+
Returns:
|
|
564
|
+
Path to the downloaded file.
|
|
565
|
+
|
|
566
|
+
Raises:
|
|
567
|
+
SourceConnectionError: If the HTTP request fails.
|
|
568
|
+
"""
|
|
569
|
+
import httpx
|
|
570
|
+
from google.auth.transport.requests import Request
|
|
571
|
+
from google.oauth2 import service_account
|
|
572
|
+
|
|
573
|
+
access_config = self.connection_config.access_config.get_secret_value()
|
|
574
|
+
key_data = access_config.get_service_account_key()
|
|
575
|
+
creds = service_account.Credentials.from_service_account_info(
|
|
576
|
+
key_data,
|
|
577
|
+
scopes=["https://www.googleapis.com/auth/drive.readonly"],
|
|
578
|
+
)
|
|
579
|
+
creds.refresh(Request())
|
|
580
|
+
|
|
581
|
+
headers = {
|
|
582
|
+
"Authorization": f"Bearer {creds.token}",
|
|
583
|
+
}
|
|
584
|
+
|
|
585
|
+
download_path = self.get_download_path(file_data)
|
|
586
|
+
if ext:
|
|
587
|
+
download_path = download_path.with_suffix(ext)
|
|
588
|
+
|
|
589
|
+
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
590
|
+
logger.debug(f"Streaming file to {download_path}")
|
|
591
|
+
|
|
592
|
+
with (
|
|
593
|
+
httpx.Client(timeout=None, follow_redirects=True) as client,
|
|
594
|
+
client.stream("GET", url, headers=headers) as response,
|
|
595
|
+
):
|
|
596
|
+
if response.status_code != 200:
|
|
597
|
+
raise SourceConnectionError(
|
|
598
|
+
f"Failed to stream download from {url}: {response.status_code}"
|
|
526
599
|
)
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
600
|
+
with open(download_path, "wb") as f:
|
|
601
|
+
for chunk in response.iter_bytes():
|
|
602
|
+
f.write(chunk)
|
|
603
|
+
|
|
604
|
+
return download_path
|
|
605
|
+
|
|
606
|
+
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
607
|
+
mime_type = file_data.additional_metadata.get("mimeType", "")
|
|
608
|
+
record_id = file_data.identifier
|
|
609
|
+
|
|
610
|
+
logger.debug(
|
|
611
|
+
f"Downloading file {file_data.source_identifiers.fullpath} of type {mime_type}"
|
|
612
|
+
)
|
|
613
|
+
|
|
614
|
+
download_url, ext = self._get_download_url_and_ext(record_id, mime_type)
|
|
615
|
+
download_path = self._download_url(file_data, download_url, ext)
|
|
616
|
+
|
|
617
|
+
file_data.additional_metadata.update(
|
|
618
|
+
{
|
|
619
|
+
"download_method": "export_link" if ext else "web_content_link",
|
|
620
|
+
"download_url_used": download_url,
|
|
621
|
+
}
|
|
622
|
+
)
|
|
623
|
+
file_data.local_download_path = str(download_path.resolve())
|
|
624
|
+
|
|
625
|
+
return self.generate_download_response(
|
|
626
|
+
file_data=file_data, download_path=download_path
|
|
627
|
+
)
|
|
536
628
|
|
|
537
629
|
|
|
538
630
|
google_drive_source_entry = SourceRegistryEntry(
|
|
@@ -56,6 +56,12 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
|
|
|
56
56
|
object_storage_endpoint: str = Field(description="Cloud Object Storage public endpoint")
|
|
57
57
|
object_storage_region: str = Field(description="Cloud Object Storage region")
|
|
58
58
|
catalog: str = Field(description="Catalog name")
|
|
59
|
+
max_retries_connection: int = Field(
|
|
60
|
+
default=10,
|
|
61
|
+
description="Maximum number of retries in case of a connection error (RESTError)",
|
|
62
|
+
ge=2,
|
|
63
|
+
le=100,
|
|
64
|
+
)
|
|
59
65
|
|
|
60
66
|
_bearer_token: Optional[dict[str, Any]] = None
|
|
61
67
|
|
|
@@ -69,8 +75,8 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
|
|
|
69
75
|
|
|
70
76
|
@property
|
|
71
77
|
def bearer_token(self) -> str:
|
|
72
|
-
# Add
|
|
73
|
-
timestamp = int(time.time()) + 60
|
|
78
|
+
# Add 5 minutes to deal with edge cases where the token expires before the request is made
|
|
79
|
+
timestamp = int(time.time()) + (60 * 5)
|
|
74
80
|
if self._bearer_token is None or self._bearer_token.get("expiration", 0) <= timestamp:
|
|
75
81
|
self._bearer_token = self.generate_bearer_token()
|
|
76
82
|
return self._bearer_token["access_token"]
|
|
@@ -145,10 +151,29 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
|
|
|
145
151
|
@contextmanager
|
|
146
152
|
def get_catalog(self) -> Generator["RestCatalog", None, None]:
|
|
147
153
|
from pyiceberg.catalog import load_catalog
|
|
154
|
+
from pyiceberg.exceptions import RESTError
|
|
155
|
+
from tenacity import (
|
|
156
|
+
before_log,
|
|
157
|
+
retry,
|
|
158
|
+
retry_if_exception_type,
|
|
159
|
+
stop_after_attempt,
|
|
160
|
+
wait_exponential,
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
# Retry connection in case of a connection error
|
|
164
|
+
@retry(
|
|
165
|
+
stop=stop_after_attempt(self.max_retries_connection),
|
|
166
|
+
wait=wait_exponential(exp_base=2, multiplier=1, min=2, max=10),
|
|
167
|
+
retry=retry_if_exception_type(RESTError),
|
|
168
|
+
before=before_log(logger, logging.DEBUG),
|
|
169
|
+
reraise=True,
|
|
170
|
+
)
|
|
171
|
+
def _get_catalog(catalog_config: dict[str, Any]) -> "RestCatalog":
|
|
172
|
+
return load_catalog(**catalog_config)
|
|
148
173
|
|
|
149
174
|
try:
|
|
150
175
|
catalog_config = self.get_catalog_config()
|
|
151
|
-
catalog =
|
|
176
|
+
catalog = _get_catalog(catalog_config)
|
|
152
177
|
except Exception as e:
|
|
153
178
|
logger.error(f"Failed to connect to catalog '{self.catalog}': {e}", exc_info=True)
|
|
154
179
|
raise ProviderError(f"Failed to connect to catalog '{self.catalog}': {e}")
|
|
@@ -172,7 +197,10 @@ class IbmWatsonxUploaderConfig(UploaderConfig):
|
|
|
172
197
|
namespace: str = Field(description="Namespace name")
|
|
173
198
|
table: str = Field(description="Table name")
|
|
174
199
|
max_retries: int = Field(
|
|
175
|
-
default=
|
|
200
|
+
default=50,
|
|
201
|
+
description="Maximum number of retries to upload data (CommitFailedException)",
|
|
202
|
+
ge=2,
|
|
203
|
+
le=500,
|
|
176
204
|
)
|
|
177
205
|
record_id_key: str = Field(
|
|
178
206
|
default=RECORD_ID_LABEL,
|
|
@@ -240,7 +268,7 @@ class IbmWatsonxUploader(SQLUploader):
|
|
|
240
268
|
def upload_data_table(
|
|
241
269
|
self, table: "Table", data_table: "ArrowTable", file_data: FileData
|
|
242
270
|
) -> None:
|
|
243
|
-
from pyiceberg.exceptions import CommitFailedException
|
|
271
|
+
from pyiceberg.exceptions import CommitFailedException, RESTError
|
|
244
272
|
from tenacity import (
|
|
245
273
|
before_log,
|
|
246
274
|
retry,
|
|
@@ -265,21 +293,51 @@ class IbmWatsonxUploader(SQLUploader):
|
|
|
265
293
|
table.refresh()
|
|
266
294
|
logger.debug(e)
|
|
267
295
|
raise IcebergCommitFailedException(e)
|
|
296
|
+
except RESTError:
|
|
297
|
+
raise
|
|
268
298
|
except Exception as e:
|
|
269
299
|
raise ProviderError(f"Failed to upload data to table: {e}")
|
|
270
300
|
|
|
271
301
|
try:
|
|
272
302
|
return _upload_data_table(table, data_table, file_data)
|
|
303
|
+
except RESTError:
|
|
304
|
+
raise
|
|
273
305
|
except ProviderError:
|
|
274
306
|
raise
|
|
275
307
|
except Exception as e:
|
|
276
308
|
raise ProviderError(f"Failed to upload data to table: {e}")
|
|
277
309
|
|
|
310
|
+
@requires_dependencies(["pyiceberg", "tenacity"], extras="ibm-watsonx-s3")
|
|
278
311
|
def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
|
|
312
|
+
from pyiceberg.exceptions import RESTError
|
|
313
|
+
from tenacity import (
|
|
314
|
+
before_log,
|
|
315
|
+
retry,
|
|
316
|
+
retry_if_exception_type,
|
|
317
|
+
stop_after_attempt,
|
|
318
|
+
wait_exponential,
|
|
319
|
+
)
|
|
320
|
+
|
|
279
321
|
data_table = self._df_to_arrow_table(df)
|
|
280
322
|
|
|
281
|
-
|
|
282
|
-
|
|
323
|
+
# Retry connection in case of a connection error or token expiration
|
|
324
|
+
@retry(
|
|
325
|
+
stop=stop_after_attempt(self.connection_config.max_retries_connection),
|
|
326
|
+
wait=wait_exponential(exp_base=2, multiplier=1, min=2, max=10),
|
|
327
|
+
retry=retry_if_exception_type(RESTError),
|
|
328
|
+
before=before_log(logger, logging.DEBUG),
|
|
329
|
+
reraise=True,
|
|
330
|
+
)
|
|
331
|
+
def _upload_dataframe(data_table: Any, file_data: FileData) -> None:
|
|
332
|
+
with self.get_table() as table:
|
|
333
|
+
self.upload_data_table(table, data_table, file_data)
|
|
334
|
+
|
|
335
|
+
try:
|
|
336
|
+
return _upload_dataframe(data_table, file_data)
|
|
337
|
+
except ProviderError:
|
|
338
|
+
raise
|
|
339
|
+
except Exception as e:
|
|
340
|
+
raise ProviderError(f"Failed to upload data to table: {e}")
|
|
283
341
|
|
|
284
342
|
@requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
|
|
285
343
|
def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.0.11" # pragma: no cover
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/base/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/base/cmd.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/base/dest.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/base/importer.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/base/src.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/utils/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/cli/utils/click.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/data_types/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/data_types/entities.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/azure_openai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/bedrock.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/huggingface.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/interfaces.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/mixedbreadai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/octoai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/openai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/togetherai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/vertexai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/embed/voyageai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/interfaces/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/interfaces/indexer.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/interfaces/process.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/interfaces/uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/interfaces.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/otel.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/pipeline/pipeline.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.11 → unstructured_ingest-1.0.13}/unstructured_ingest/processes/chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|