unstructured-ingest 1.0.2__tar.gz → 1.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/PKG-INFO +1 -1
- unstructured_ingest-1.0.5/unstructured_ingest/__version__.py +1 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/data_types/file_data.py +24 -2
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/confluence.py +183 -16
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/google_drive.py +42 -1
- unstructured_ingest-1.0.2/unstructured_ingest/__version__.py +0 -1
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/.gitignore +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/LICENSE.md +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/README.md +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/pyproject.toml +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/README.md +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/cmd.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/dest.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/importer.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/src.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/cli.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/cmds.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/utils/click.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/data_types/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/azure_openai.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/bedrock.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/huggingface.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/interfaces.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/mixedbreadai.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/octoai.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/openai.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/togetherai.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/vertexai.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/voyageai.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/error.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/errors_v2.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/connector.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/downloader.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/indexer.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/process.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/processor.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/upload_stager.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/uploader.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/logger.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/main.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/otel.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/interfaces.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/otel.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/pipeline.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/download.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/embed.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/filter.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/index.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/partition.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/stage.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/upload.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/chunker.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connector_registry.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/airtable.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/astradb.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/chroma.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/volumes.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/delta_table.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/discord.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/s3.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/github.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/gitlab.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/jira.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/local.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/milvus.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/mongodb.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/connector.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/onedrive.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/outlook.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/pinecone.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/redisdb.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/salesforce.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sharepoint.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/slack.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/sql.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/utils.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/vectara.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/embedder.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/filter.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/partitioner.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/uncompress.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/unstructured_api.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/chunking.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/compression.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/constants.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/data_prep.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/dep_check.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/html.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/ndjson.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/pydantic_models.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
- {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/table.py +0 -0
|
unstructured_ingest-1.0.5/unstructured_ingest/__version__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.5" # pragma: no cover
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/data_types/file_data.py
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import Any, Optional
|
|
3
|
+
from typing import Any, Optional, Union
|
|
4
4
|
from uuid import NAMESPACE_DNS, uuid5
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
|
|
@@ -29,9 +29,31 @@ class FileDataSourceMetadata(BaseModel):
|
|
|
29
29
|
date_created: Optional[str] = None
|
|
30
30
|
date_modified: Optional[str] = None
|
|
31
31
|
date_processed: Optional[str] = None
|
|
32
|
-
permissions_data:
|
|
32
|
+
permissions_data: Union[list[dict[str, Any]], dict[str, Any], None] = None
|
|
33
33
|
filesize_bytes: Optional[int] = None
|
|
34
34
|
|
|
35
|
+
@field_validator("permissions_data", mode="before")
|
|
36
|
+
@classmethod
|
|
37
|
+
def coerce_permissions_data(cls, v: Any) -> Any:
|
|
38
|
+
if isinstance(v, dict):
|
|
39
|
+
# Temporarily convert dict to list for validation
|
|
40
|
+
return [v]
|
|
41
|
+
return v
|
|
42
|
+
|
|
43
|
+
@field_validator("permissions_data", mode="after")
|
|
44
|
+
@classmethod
|
|
45
|
+
def restore_dict_permissions_data(
|
|
46
|
+
cls, v: Optional[list[dict[str, Any]]]
|
|
47
|
+
) -> Union[list[dict[str, Any]], dict[str, Any], None]:
|
|
48
|
+
if (
|
|
49
|
+
isinstance(v, list)
|
|
50
|
+
and len(v) == 1
|
|
51
|
+
and isinstance(v[0], dict)
|
|
52
|
+
and any(isinstance(val, dict) for val in v[0].values())
|
|
53
|
+
):
|
|
54
|
+
return v[0]
|
|
55
|
+
return v
|
|
56
|
+
|
|
35
57
|
|
|
36
58
|
class FileData(BaseModel):
|
|
37
59
|
identifier: str
|
|
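{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/confluence.py
RENAMED
|
@@ -1,7 +1,8 @@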
|
|
|
1
|
+
from collections import OrderedDict
|
|
1
2
|
from contextlib import contextmanager
|
|
2
3
|
from dataclasses import dataclass, field
|
|
3
4
|
from pathlib import Path
|
|
4
|
-
from typing import TYPE_CHECKING, Generator, List, Optional
|
|
5
|
+
from typing import TYPE_CHECKING, Generator, List, Optional, Tuple
|
|
5
6
|
|
|
6
7
|
from pydantic import Field, Secret
|
|
7
8
|
|
|
@@ -135,35 +136,46 @@ class ConfluenceIndexer(Indexer):
|
|
|
135
136
|
logger.error(f"Failed to connect to Confluence: {e}", exc_info=True)
|
|
136
137
|
raise SourceConnectionError(f"Failed to connect to Confluence: {e}")
|
|
137
138
|
|
|
138
|
-
def
|
|
139
|
+
def _get_space_ids_and_keys(self) -> List[Tuple[str, int]]:
    """
    Build the list of (space key, space ID) pairs to index.

    If ``index_config.spaces`` is set, each listed key is looked up through the
    client; otherwise the client lists spaces, limited by
    ``index_config.max_num_of_spaces``.

    Example space ID (numerical): 98503
    Example space key (str): "SD"
    """
    requested_keys = self.index_config.spaces
    with self.connection_config.get_client() as client:
        if requested_keys:
            # One lookup per configured key to resolve its ID.
            return [(key, client.get_space(key)["id"]) for key in requested_keys]
        listing = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
    return [(entry["key"], entry["id"]) for entry in listing["results"]]
|
|
147
159
|
|
|
148
|
-
def _get_docs_ids_within_one_space(self,
|
|
160
|
+
def _get_docs_ids_within_one_space(self, space_key: str) -> List[dict]:
    """
    List the pages of one space as ``{"space_id", "doc_id"}`` records.

    Note that the "space_id" entry carries the space *key* passed in, not the
    numerical space ID. At most
    ``index_config.max_num_of_docs_from_each_space`` pages are requested.
    """
    with self.connection_config.get_client() as client:
        query = {
            "space": space_key,
            "start": 0,
            "limit": self.index_config.max_num_of_docs_from_each_space,
            "expand": None,
            "content_type": "page",  # blogpost and comment types not currently supported
            "status": None,
        }
        pages = client.get_all_pages_from_space(**query)

    records = []
    for page in pages:
        records.append({"space_id": space_key, "doc_id": page["id"]})
    return records
|
|
160
172
|
|
|
161
173
|
def run(self) -> Generator[FileData, None, None]:
|
|
162
174
|
from time import time
|
|
163
175
|
|
|
164
|
-
|
|
165
|
-
for space_id in
|
|
166
|
-
doc_ids = self._get_docs_ids_within_one_space(
|
|
176
|
+
space_ids_and_keys = self._get_space_ids_and_keys()
|
|
177
|
+
for space_key, space_id in space_ids_and_keys:
|
|
178
|
+
doc_ids = self._get_docs_ids_within_one_space(space_key)
|
|
167
179
|
for doc in doc_ids:
|
|
168
180
|
doc_id = doc["doc_id"]
|
|
169
181
|
# Build metadata
|
|
@@ -171,18 +183,19 @@ class ConfluenceIndexer(Indexer):
|
|
|
171
183
|
date_processed=str(time()),
|
|
172
184
|
url=f"{self.connection_config.url}/pages/{doc_id}",
|
|
173
185
|
record_locator={
|
|
174
|
-
"space_id":
|
|
186
|
+
"space_id": space_key,
|
|
175
187
|
"document_id": doc_id,
|
|
176
188
|
},
|
|
177
189
|
)
|
|
178
190
|
additional_metadata = {
|
|
179
|
-
"
|
|
191
|
+
"space_key": space_key,
|
|
192
|
+
"space_id": space_id, # diff from record_locator space_id (which is space_key)
|
|
180
193
|
"document_id": doc_id,
|
|
181
194
|
}
|
|
182
195
|
|
|
183
196
|
# Construct relative path and filename
|
|
184
197
|
filename = f"{doc_id}.html"
|
|
185
|
-
relative_path = str(Path(
|
|
198
|
+
relative_path = str(Path(space_key) / filename)
|
|
186
199
|
|
|
187
200
|
source_identifiers = SourceIdentifiers(
|
|
188
201
|
filename=filename,
|
|
@@ -201,7 +214,9 @@ class ConfluenceIndexer(Indexer):
|
|
|
201
214
|
|
|
202
215
|
|
|
203
216
|
class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin):
    # Soft cap on how many permission entries are written into file metadata;
    # the description marks it as approximate.
    max_num_metadata_permissions: int = Field(
        default=250,
        description="Approximate maximum number of permissions included in metadata",
    )
|
|
205
220
|
|
|
206
221
|
|
|
207
222
|
@dataclass
|
|
@@ -209,6 +224,8 @@ class ConfluenceDownloader(Downloader):
|
|
|
209
224
|
connection_config: ConfluenceConnectionConfig
|
|
210
225
|
download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
|
|
211
226
|
connector_type: str = CONNECTOR_TYPE
|
|
227
|
+
_permissions_cache: dict = field(default_factory=OrderedDict)
|
|
228
|
+
_permissions_cache_max_size: int = 5
|
|
212
229
|
|
|
213
230
|
def download_embedded_files(
|
|
214
231
|
self, session, html: str, current_file_data: FileData
|
|
@@ -233,6 +250,148 @@ class ConfluenceDownloader(Downloader):
|
|
|
233
250
|
session=session,
|
|
234
251
|
)
|
|
235
252
|
|
|
253
|
+
def parse_permissions(self, doc_permissions: dict, space_permissions: list) -> dict[str, dict]:
    """
    Parses document and space permissions to determine final user/group roles.

    :param doc_permissions: dict containing document-level restrictions
        - doc_permissions type in Confluence: ContentRestrictionArray
    :param space_permissions: list of space-level permission assignments
        - space_permissions type in Confluence: list of SpacePermissionAssignment
    :return: dict with operation as keys and each maps to dict with "users" and "groups"
        (each a sorted list of IDs)

    Get document permissions. If they exist, they will override space level permissions.
    Otherwise, apply relevant space permissions (read, administer, delete)

    Entries that cannot be represented in the result are skipped instead of raising:
    document-level keys other than read/update/delete, restriction entries that are not
    "user"/"group" or carry no "results", and space principals that are neither "user"
    nor "group". Space permissions stop being applied once roughly
    ``download_config.max_num_metadata_permissions`` entries have been collected.
    """
    supported_entity_types = ("user", "group")

    def _page_restricted(action: str) -> bool:
        # True when the page itself lists at least one user or group for `action`.
        restrictions = doc_permissions.get(action, {}).get("restrictions", {})
        return bool(
            restrictions.get("user", {}).get("results")
            or restrictions.get("group", {}).get("results")
        )

    # Separate flags to track if view or edit is restricted at the page level
    page_view_restricted = _page_restricted("read")
    page_edit_restricted = _page_restricted("update")

    permissions_by_role = {
        "read": {"users": set(), "groups": set()},
        "update": {"users": set(), "groups": set()},
        "delete": {"users": set(), "groups": set()},
    }

    total_permissions = 0

    for action, permissions in doc_permissions.items():
        # Skip response keys that are not operations we track (e.g. link metadata).
        if action not in permissions_by_role or not isinstance(permissions, dict):
            continue

        for entity_type, entity_data in permissions.get("restrictions", {}).items():
            if entity_type not in supported_entity_types or not isinstance(entity_data, dict):
                continue
            for entity in entity_data.get("results") or []:
                entity_id = entity["accountId"] if entity_type == "user" else entity["id"]
                permissions_by_role[action][f"{entity_type}s"].add(entity_id)
                total_permissions += 1
                # edit permission implies view permission
                if action == "update":
                    # Not counted again so one entity is not double counted; the final
                    # total may therefore exceed max_num_metadata_permissions.
                    permissions_by_role["read"][f"{entity_type}s"].add(entity_id)

    max_permissions = self.download_config.max_num_metadata_permissions
    for space_perm in space_permissions:
        # The counter never decreases, so nothing further can be added past the cap.
        if total_permissions >= max_permissions:
            break

        space_operation = space_perm["operation"]["key"]
        space_target_type = space_perm["operation"]["targetType"]
        space_entity_id = space_perm["principal"]["id"]
        space_entity_type = space_perm["principal"]["type"]

        # Only users and groups have a bucket in the result; other principal types
        # would otherwise raise KeyError and discard every permission for the doc.
        if space_entity_type not in supported_entity_types:
            continue
        bucket = f"{space_entity_type}s"

        # Apply space-level view permissions if no page restrictions exist
        if space_target_type == "space" and space_operation == "read":
            if not page_view_restricted:
                permissions_by_role["read"][bucket].add(space_entity_id)
                total_permissions += 1

        # Administer permission includes view + edit. Apply if not page restricted
        elif space_target_type == "space" and space_operation == "administer":
            if not page_view_restricted:
                permissions_by_role["read"][bucket].add(space_entity_id)
                total_permissions += 1
            if not page_edit_restricted:
                # Not counted again so one entity is not double counted.
                permissions_by_role["update"][bucket].add(space_entity_id)

        # Add the "delete page" space permissions if there are other page permissions
        elif (
            space_target_type == "page"
            and space_operation == "delete"
            and space_entity_id in permissions_by_role["read"][bucket]
        ):
            permissions_by_role["delete"][bucket].add(space_entity_id)
            total_permissions += 1

    # turn sets into sorted lists for consistency and json serialization
    for role_dict in permissions_by_role.values():
        for key in role_dict:
            role_dict[key] = sorted(role_dict[key])

    return permissions_by_role
|
|
351
|
+
|
|
352
|
+
def _get_permissions_for_space(self, space_id: int) -> Optional[List[dict]]:
    """
    Return all permission assignments of a space, or None if they cannot be fetched.

    Results are kept in ``self._permissions_cache`` (an ordered dict used as a small
    recency cache holding at most ``self._permissions_cache_max_size`` spaces), so
    repeated documents from the same space do not trigger another request.

    :param space_id: numerical space ID (not the space key)
    :return: list of permission assignment dicts, or None when the request fails
    """
    from urllib.parse import parse_qsl, urlsplit

    if space_id in self._permissions_cache:
        self._permissions_cache.move_to_end(space_id)  # mark recent use
        logger.debug(f"Retrieved cached permissions for space {space_id}")
        return self._permissions_cache[space_id]

    endpoint = f"/api/v2/spaces/{space_id}/permissions"
    with self.connection_config.get_client() as client:
        try:
            # TODO limit the total number of results being called.
            # not yet implemented because this client call doesn't allow for filtering for
            # certain operations, so adding a limit here would result in too little data.
            space_permissions = []
            page = client.get(endpoint)
            space_permissions.extend(page["results"])

            # Follow pagination through `_links.next`. Previously the loop condition
            # looked for a top-level "next" key, so it never ran and only the first
            # page was returned and cached.
            next_link = page.get("_links", {}).get("next")
            while next_link:
                # Re-request the same endpoint with the query parameters of the next
                # link, so the result does not depend on which prefix the link has.
                # NOTE(review): assumes the client's `get` accepts `params` - confirm.
                page_params = dict(parse_qsl(urlsplit(next_link).query))
                if not page_params:
                    # No cursor to advance with; stop rather than refetch page one.
                    break
                page = client.get(endpoint, params=page_params)
                space_permissions.extend(page["results"])
                next_link = page.get("_links", {}).get("next")

            if len(self._permissions_cache) >= self._permissions_cache_max_size:
                self._permissions_cache.popitem(last=False)  # LRU/FIFO eviction
            self._permissions_cache[space_id] = space_permissions

            logger.debug(f"Retrieved permissions for space {space_id}")
            return space_permissions
        except Exception as e:
            # Best effort: permission metadata is skipped when the lookup fails.
            logger.debug(f"Could not retrieve permissions for space {space_id}: {e}")
            return None
|
|
380
|
+
|
|
381
|
+
def _parse_permissions_for_doc(self, doc_id: str, space_permissions: list) -> Optional[dict]:
    """
    Fetch the restrictions of one document and merge them with its space permissions.

    :param doc_id: ID of the document whose restrictions are requested
    :param space_permissions: space-level permission assignments handed to
        ``parse_permissions``
    :return: the dict produced by ``parse_permissions``, or None when fetching or
        parsing raised
    """
    combined = None
    with self.connection_config.get_client() as client:
        try:
            restrictions = client.get_all_restrictions_for_content(content_id=doc_id)
            combined = self.parse_permissions(restrictions, space_permissions)
        except Exception as e:
            # skip writing any permission metadata
            logger.debug(f"Could not retrieve permissions for doc {doc_id}: {e}")
            return None

    logger.debug(f"normalized permissions generated: {combined}")
    return combined
|
|
394
|
+
|
|
236
395
|
def run(self, file_data: FileData, **kwargs) -> download_responses:
|
|
237
396
|
from bs4 import BeautifulSoup
|
|
238
397
|
|
|
@@ -268,6 +427,14 @@ class ConfluenceDownloader(Downloader):
|
|
|
268
427
|
soup = BeautifulSoup(content, "html.parser")
|
|
269
428
|
f.write(soup.prettify())
|
|
270
429
|
|
|
430
|
+
# Get document permissions and update metadata
|
|
431
|
+
space_id = file_data.additional_metadata["space_id"]
|
|
432
|
+
space_perm = self._get_permissions_for_space(space_id) # must be the id, NOT the space key
|
|
433
|
+
if space_perm:
|
|
434
|
+
combined_doc_permissions = self._parse_permissions_for_doc(doc_id, space_perm)
|
|
435
|
+
if combined_doc_permissions:
|
|
436
|
+
file_data.metadata.permissions_data = combined_doc_permissions
|
|
437
|
+
|
|
271
438
|
# Update file_data with metadata
|
|
272
439
|
file_data.metadata.date_created = page["history"]["createdDate"]
|
|
273
440
|
file_data.metadata.date_modified = page["version"]["when"]
|
|
@@ -391,6 +391,7 @@ class GoogleDriveIndexer(Indexer):
|
|
|
391
391
|
) -> list[FileData]:
|
|
392
392
|
root_info = self.get_root_info(files_client=files_client, object_id=object_id)
|
|
393
393
|
if not self.is_dir(root_info):
|
|
394
|
+
root_info["permissions"] = self.extract_permissions(root_info.get("permissions"))
|
|
394
395
|
data = [self.map_file_data(root_info)]
|
|
395
396
|
else:
|
|
396
397
|
file_contents = self.get_paginated_results(
|
|
@@ -400,11 +401,51 @@ class GoogleDriveIndexer(Indexer):
|
|
|
400
401
|
recursive=recursive,
|
|
401
402
|
previous_path=root_info["name"],
|
|
402
403
|
)
|
|
403
|
-
data = [
|
|
404
|
+
data = []
|
|
405
|
+
for f in file_contents:
|
|
406
|
+
f["permissions"] = self.extract_permissions(f.get("permissions"))
|
|
407
|
+
data.append(self.map_file_data(f=f))
|
|
404
408
|
for d in data:
|
|
405
409
|
d.metadata.record_locator["drive_id"]: object_id
|
|
406
410
|
return data
|
|
407
411
|
|
|
412
|
+
def extract_permissions(self, permissions: Optional[list[dict]]) -> dict:
    """
    Normalize a list of Drive permission entries into read/update/delete buckets.

    :param permissions: permission dicts, each read for its "type", "role" and "id"
    :return: ``{}`` when there are no permissions; otherwise a dict keyed by
        "read", "update" and "delete", each mapping "users" and "groups" to a
        sorted list of IDs

    Entries of a type other than "user"/"group" are ignored. Entries with a role
    missing from the mapping, or without an "id", are skipped with a debug log
    instead of raising, so one unexpected entry does not abort indexing.
    """
    if not permissions:
        logger.debug("no permissions found")
        return {}

    # https://developers.google.com/workspace/drive/api/guides/ref-roles
    role_mapping = {
        "owner": ["read", "update", "delete"],
        "organizer": ["read", "update", "delete"],
        "fileOrganizer": ["read", "update"],
        "writer": ["read", "update"],
        "commenter": ["read"],
        "reader": ["read"],
    }

    normalized_permissions = {
        "read": {"users": set(), "groups": set()},
        "update": {"users": set(), "groups": set()},
        "delete": {"users": set(), "groups": set()},
    }

    for item in permissions:
        # https://developers.google.com/workspace/drive/api/reference/rest/v3/permissions
        # ignore permissions for "anyone" and "domain"
        entity_type = item.get("type")
        if entity_type not in ("user", "group"):
            continue

        entity_id = item.get("id")
        operations = role_mapping.get(item.get("role"))
        if entity_id is None or operations is None:
            logger.debug(f"skipping permission entry with unsupported role or no id: {item}")
            continue

        type_key = entity_type + "s"
        for operation in operations:
            normalized_permissions[operation][type_key].add(entity_id)

    # turn sets into sorted lists for consistency and json serialization
    for role_dict in normalized_permissions.values():
        for key in role_dict:
            role_dict[key] = sorted(role_dict[key])

    logger.debug(f"normalized permissions generated: {normalized_permissions}")
    return normalized_permissions
|
|
448
|
+
|
|
408
449
|
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
409
450
|
with self.connection_config.get_client() as client:
|
|
410
451
|
for f in self.get_files(
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.0.2" # pragma: no cover
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/dest.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/importer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/utils/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/utils/click.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/data_types/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/azure_openai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/bedrock.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/huggingface.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/interfaces.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/mixedbreadai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/togetherai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/vertexai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/voyageai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/connector.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/downloader.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/indexer.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/process.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/processor.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/interfaces.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/otel.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/pipeline.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/chunk.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/embed.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/filter.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/stage.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/upload.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|