unstructured-ingest 1.0.1__tar.gz → 1.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/PKG-INFO +2 -2
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/pyproject.toml +0 -1
- unstructured_ingest-1.0.4/unstructured_ingest/__version__.py +1 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/data_types/file_data.py +2 -2
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/astradb.py +47 -33
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/confluence.py +180 -16
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/google_drive.py +40 -1
- unstructured_ingest-1.0.1/unstructured_ingest/__version__.py +0 -1
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/.gitignore +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/LICENSE.md +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/README.md +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/README.md +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/base/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/base/cmd.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/base/dest.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/base/importer.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/base/src.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/cli.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/cmds.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/utils/click.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/data_types/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/azure_openai.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/bedrock.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/huggingface.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/interfaces.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/mixedbreadai.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/octoai.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/openai.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/togetherai.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/vertexai.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/voyageai.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/error.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/errors_v2.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/connector.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/downloader.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/indexer.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/process.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/processor.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/upload_stager.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/uploader.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/logger.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/main.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/otel.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/interfaces.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/otel.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/pipeline.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/steps/download.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/steps/embed.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/steps/filter.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/steps/index.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/steps/partition.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/steps/stage.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/pipeline/steps/upload.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/chunker.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connector_registry.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/airtable.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/chroma.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/databricks/volumes.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/delta_table.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/discord.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/fsspec/s3.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/github.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/gitlab.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/jira.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/local.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/milvus.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/mongodb.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/connector.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/onedrive.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/outlook.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/pinecone.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/redisdb.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/salesforce.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/sharepoint.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/slack.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/sql/sql.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/utils.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/vectara.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/embedder.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/filter.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/partitioner.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/uncompress.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/unstructured_api.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/utils/chunking.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/utils/compression.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/utils/constants.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/utils/data_prep.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/utils/dep_check.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/utils/html.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/utils/ndjson.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/utils/pydantic_models.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
- {unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/utils/table.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: unstructured_ingest
|
|
3
|
-
Version: 1.0.1
|
|
3
|
+
Version: 1.0.4
|
|
4
4
|
Summary: Local ETL data pipeline to get data RAG ready
|
|
5
5
|
Author-email: Unstructured Technologies <devops@unstructuredai.io>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -28,7 +28,7 @@ Provides-Extra: airtable
|
|
|
28
28
|
Requires-Dist: pandas; extra == 'airtable'
|
|
29
29
|
Requires-Dist: pyairtable; extra == 'airtable'
|
|
30
30
|
Provides-Extra: astradb
|
|
31
|
-
Requires-Dist: astrapy; extra == 'astradb'
|
|
31
|
+
Requires-Dist: astrapy>2.0.0; extra == 'astradb'
|
|
32
32
|
Provides-Extra: azure
|
|
33
33
|
Requires-Dist: adlfs; extra == 'azure'
|
|
34
34
|
Requires-Dist: fsspec; extra == 'azure'
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.4" # pragma: no cover
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/data_types/file_data.py
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import Any, Optional
|
|
3
|
+
from typing import Any, Optional, Union
|
|
4
4
|
from uuid import NAMESPACE_DNS, uuid5
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
|
|
@@ -29,7 +29,7 @@ class FileDataSourceMetadata(BaseModel):
|
|
|
29
29
|
date_created: Optional[str] = None
|
|
30
30
|
date_modified: Optional[str] = None
|
|
31
31
|
date_processed: Optional[str] = None
|
|
32
|
-
permissions_data:
|
|
32
|
+
permissions_data: Union[list[dict[str, Any]], dict[str, Any], None] = None
|
|
33
33
|
filesize_bytes: Optional[int] = None
|
|
34
34
|
|
|
35
35
|
|
|
@@ -1,5 +1,7 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import csv
|
|
2
3
|
import hashlib
|
|
4
|
+
import os
|
|
3
5
|
import re
|
|
4
6
|
from dataclasses import dataclass, field
|
|
5
7
|
from pathlib import Path
|
|
@@ -8,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
|
8
10
|
|
|
9
11
|
from pydantic import BaseModel, Field, Secret
|
|
10
12
|
|
|
11
|
-
from unstructured_ingest import __name__ as integration_name
|
|
12
13
|
from unstructured_ingest.__version__ import __version__ as integration_version
|
|
13
14
|
from unstructured_ingest.data_types.file_data import (
|
|
14
15
|
BatchFileData,
|
|
@@ -83,10 +84,8 @@ class AstraDBConnectionConfig(ConnectionConfig):
|
|
|
83
84
|
|
|
84
85
|
# Create a client object to interact with the Astra DB
|
|
85
86
|
# caller_name/version for Astra DB tracking
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
caller_version=integration_version,
|
|
89
|
-
)
|
|
87
|
+
user_agent = os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
|
|
88
|
+
return AstraDBClient(callers=[(user_agent, integration_version)])
|
|
90
89
|
|
|
91
90
|
|
|
92
91
|
def get_astra_db(
|
|
@@ -141,7 +140,7 @@ async def get_async_astra_collection(
|
|
|
141
140
|
)
|
|
142
141
|
|
|
143
142
|
# Get async collection from AsyncDatabase
|
|
144
|
-
async_astra_db_collection =
|
|
143
|
+
async_astra_db_collection = async_astra_db.get_collection(name=collection_name)
|
|
145
144
|
return async_astra_db_collection
|
|
146
145
|
|
|
147
146
|
|
|
@@ -360,13 +359,22 @@ class AstraDBUploader(Uploader):
|
|
|
360
359
|
upload_config: AstraDBUploaderConfig
|
|
361
360
|
connector_type: str = CONNECTOR_TYPE
|
|
362
361
|
|
|
362
|
+
def is_async(self) -> bool:
    """Tell the caller that this uploader is asynchronous.

    :return: always ``True``
    """
    asynchronous = True
    return asynchronous
|
|
364
|
+
|
|
363
365
|
def init(self, **kwargs: Any) -> None:
|
|
364
366
|
self.create_destination(**kwargs)
|
|
365
367
|
|
|
368
|
+
@requires_dependencies(["astrapy"], extras="astradb")
|
|
366
369
|
def precheck(self) -> None:
|
|
367
370
|
try:
|
|
368
371
|
if self.upload_config.collection_name:
|
|
369
|
-
|
|
372
|
+
collection = get_astra_collection(
|
|
373
|
+
connection_config=self.connection_config,
|
|
374
|
+
collection_name=self.upload_config.collection_name,
|
|
375
|
+
keyspace=self.upload_config.keyspace,
|
|
376
|
+
)
|
|
377
|
+
collection.options()
|
|
370
378
|
else:
|
|
371
379
|
# check for db connection only if collection name is not provided
|
|
372
380
|
get_astra_db(
|
|
@@ -377,17 +385,7 @@ class AstraDBUploader(Uploader):
|
|
|
377
385
|
logger.error(f"Failed to validate connection {e}", exc_info=True)
|
|
378
386
|
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
|
379
387
|
|
|
380
|
-
@requires_dependencies(["astrapy"], extras="astradb")
|
|
381
|
-
def get_collection(self, collection_name: Optional[str] = None) -> "AstraDBCollection":
|
|
382
|
-
return get_astra_collection(
|
|
383
|
-
connection_config=self.connection_config,
|
|
384
|
-
collection_name=collection_name or self.upload_config.collection_name,
|
|
385
|
-
keyspace=self.upload_config.keyspace,
|
|
386
|
-
)
|
|
387
|
-
|
|
388
388
|
def _collection_exists(self, collection_name: str):
|
|
389
|
-
from astrapy.exceptions import CollectionNotFoundException
|
|
390
|
-
|
|
391
389
|
collection = get_astra_collection(
|
|
392
390
|
connection_config=self.connection_config,
|
|
393
391
|
collection_name=collection_name,
|
|
@@ -397,8 +395,10 @@ class AstraDBUploader(Uploader):
|
|
|
397
395
|
try:
|
|
398
396
|
collection.options()
|
|
399
397
|
return True
|
|
400
|
-
except
|
|
401
|
-
|
|
398
|
+
except RuntimeError as e:
|
|
399
|
+
if "not found" in str(e):
|
|
400
|
+
return False
|
|
401
|
+
raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")
|
|
402
402
|
except Exception as e:
|
|
403
403
|
logger.error(f"failed to check if astra collection exists : {e}")
|
|
404
404
|
raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")
|
|
@@ -422,6 +422,8 @@ class AstraDBUploader(Uploader):
|
|
|
422
422
|
self.upload_config.collection_name = collection_name
|
|
423
423
|
|
|
424
424
|
if not self._collection_exists(collection_name):
|
|
425
|
+
from astrapy.info import CollectionDefinition
|
|
426
|
+
|
|
425
427
|
astra_db = get_astra_db(
|
|
426
428
|
connection_config=self.connection_config, keyspace=self.upload_config.keyspace
|
|
427
429
|
)
|
|
@@ -429,44 +431,56 @@ class AstraDBUploader(Uploader):
|
|
|
429
431
|
f"creating default astra collection '{collection_name}' with dimension "
|
|
430
432
|
f"{vector_length} and metric {similarity_metric}"
|
|
431
433
|
)
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
dimension=vector_length
|
|
435
|
-
|
|
434
|
+
definition = (
|
|
435
|
+
CollectionDefinition.builder()
|
|
436
|
+
.set_vector_dimension(dimension=vector_length)
|
|
437
|
+
.set_vector_metric(similarity_metric)
|
|
438
|
+
.build()
|
|
436
439
|
)
|
|
440
|
+
(astra_db.create_collection(collection_name, definition=definition),)
|
|
437
441
|
return True
|
|
438
442
|
logger.debug(f"collection with name '{collection_name}' already exists, skipping creation")
|
|
439
443
|
return False
|
|
440
444
|
|
|
441
|
-
def delete_by_record_id(self, collection: "
|
|
445
|
+
async def delete_by_record_id(self, collection: "AstraDBAsyncCollection", file_data: FileData):
|
|
442
446
|
logger.debug(
|
|
443
447
|
f"deleting records from collection {collection.name} "
|
|
444
448
|
f"with {self.upload_config.record_id_key} "
|
|
445
449
|
f"set to {file_data.identifier}"
|
|
446
450
|
)
|
|
447
451
|
delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
|
|
448
|
-
delete_resp = collection.delete_many(filter=delete_filter)
|
|
452
|
+
delete_resp = await collection.delete_many(filter=delete_filter)
|
|
449
453
|
logger.debug(
|
|
450
454
|
f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
|
|
451
455
|
)
|
|
452
456
|
|
|
453
|
-
def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
|
|
457
|
+
async def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
|
|
454
458
|
logger.info(
|
|
455
459
|
f"writing {len(data)} objects to destination "
|
|
456
460
|
f"collection {self.upload_config.collection_name}"
|
|
457
461
|
)
|
|
458
462
|
|
|
459
463
|
astra_db_batch_size = self.upload_config.batch_size
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
464
|
+
async_astra_collection = await get_async_astra_collection(
|
|
465
|
+
connection_config=self.connection_config,
|
|
466
|
+
collection_name=self.upload_config.collection_name,
|
|
467
|
+
keyspace=self.upload_config.keyspace,
|
|
468
|
+
)
|
|
463
469
|
|
|
464
|
-
|
|
465
|
-
|
|
470
|
+
await self.delete_by_record_id(collection=async_astra_collection, file_data=file_data)
|
|
471
|
+
await asyncio.gather(
|
|
472
|
+
*[
|
|
473
|
+
async_astra_collection.insert_many(chunk)
|
|
474
|
+
for chunk in batch_generator(data, astra_db_batch_size)
|
|
475
|
+
]
|
|
476
|
+
)
|
|
466
477
|
|
|
467
|
-
def
|
|
478
|
+
async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
468
479
|
data = get_json_data(path=path)
|
|
469
|
-
self.run_data(data=data, file_data=file_data
|
|
480
|
+
await self.run_data(data=data, file_data=file_data)
|
|
481
|
+
|
|
482
|
+
def run(self, **kwargs: Any) -> Any:
    """Synchronous entry point, which this uploader does not support.

    :raises NotImplementedError: always; the message points callers at ``run_async``
    """
    message = "Use astradb run_async instead"
    raise NotImplementedError(message)
|
|
470
484
|
|
|
471
485
|
|
|
472
486
|
astra_db_source_entry = SourceRegistryEntry(
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
+
from collections import OrderedDict
|
|
1
2
|
from contextlib import contextmanager
|
|
2
3
|
from dataclasses import dataclass, field
|
|
3
4
|
from pathlib import Path
|
|
4
|
-
from typing import TYPE_CHECKING, Generator, List, Optional
|
|
5
|
+
from typing import TYPE_CHECKING, Generator, List, Optional, Tuple
|
|
5
6
|
|
|
6
7
|
from pydantic import Field, Secret
|
|
7
8
|
|
|
@@ -135,35 +136,46 @@ class ConfluenceIndexer(Indexer):
|
|
|
135
136
|
logger.error(f"Failed to connect to Confluence: {e}", exc_info=True)
|
|
136
137
|
raise SourceConnectionError(f"Failed to connect to Confluence: {e}")
|
|
137
138
|
|
|
138
|
-
def
|
|
139
|
+
def _get_space_ids_and_keys(self) -> List[Tuple[str, int]]:
|
|
140
|
+
"""
|
|
141
|
+
Get a list of space IDs and keys from Confluence.
|
|
142
|
+
|
|
143
|
+
Example space ID (numerical): 98503
|
|
144
|
+
Example space key (str): "SD"
|
|
145
|
+
"""
|
|
139
146
|
spaces = self.index_config.spaces
|
|
140
147
|
if spaces:
|
|
141
|
-
|
|
148
|
+
with self.connection_config.get_client() as client:
|
|
149
|
+
space_ids_and_keys = []
|
|
150
|
+
for space_key in spaces:
|
|
151
|
+
space = client.get_space(space_key)
|
|
152
|
+
space_ids_and_keys.append((space_key, space["id"]))
|
|
153
|
+
return space_ids_and_keys
|
|
142
154
|
else:
|
|
143
155
|
with self.connection_config.get_client() as client:
|
|
144
156
|
all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
|
|
145
|
-
|
|
146
|
-
return
|
|
157
|
+
space_ids_and_keys = [(space["key"], space["id"]) for space in all_spaces["results"]]
|
|
158
|
+
return space_ids_and_keys
|
|
147
159
|
|
|
148
|
-
def _get_docs_ids_within_one_space(self,
|
|
160
|
+
def _get_docs_ids_within_one_space(self, space_key: str) -> List[dict]:
|
|
149
161
|
with self.connection_config.get_client() as client:
|
|
150
162
|
pages = client.get_all_pages_from_space(
|
|
151
|
-
space=
|
|
163
|
+
space=space_key,
|
|
152
164
|
start=0,
|
|
153
165
|
limit=self.index_config.max_num_of_docs_from_each_space,
|
|
154
166
|
expand=None,
|
|
155
|
-
content_type="page",
|
|
167
|
+
content_type="page", # blogpost and comment types not currently supported
|
|
156
168
|
status=None,
|
|
157
169
|
)
|
|
158
|
-
doc_ids = [{"space_id":
|
|
170
|
+
doc_ids = [{"space_id": space_key, "doc_id": page["id"]} for page in pages]
|
|
159
171
|
return doc_ids
|
|
160
172
|
|
|
161
173
|
def run(self) -> Generator[FileData, None, None]:
|
|
162
174
|
from time import time
|
|
163
175
|
|
|
164
|
-
|
|
165
|
-
for space_id in
|
|
166
|
-
doc_ids = self._get_docs_ids_within_one_space(
|
|
176
|
+
space_ids_and_keys = self._get_space_ids_and_keys()
|
|
177
|
+
for space_key, space_id in space_ids_and_keys:
|
|
178
|
+
doc_ids = self._get_docs_ids_within_one_space(space_key)
|
|
167
179
|
for doc in doc_ids:
|
|
168
180
|
doc_id = doc["doc_id"]
|
|
169
181
|
# Build metadata
|
|
@@ -171,18 +183,19 @@ class ConfluenceIndexer(Indexer):
|
|
|
171
183
|
date_processed=str(time()),
|
|
172
184
|
url=f"{self.connection_config.url}/pages/{doc_id}",
|
|
173
185
|
record_locator={
|
|
174
|
-
"space_id":
|
|
186
|
+
"space_id": space_key,
|
|
175
187
|
"document_id": doc_id,
|
|
176
188
|
},
|
|
177
189
|
)
|
|
178
190
|
additional_metadata = {
|
|
179
|
-
"
|
|
191
|
+
"space_key": space_key,
|
|
192
|
+
"space_id": space_id, # diff from record_locator space_id (which is space_key)
|
|
180
193
|
"document_id": doc_id,
|
|
181
194
|
}
|
|
182
195
|
|
|
183
196
|
# Construct relative path and filename
|
|
184
197
|
filename = f"{doc_id}.html"
|
|
185
|
-
relative_path = str(Path(
|
|
198
|
+
relative_path = str(Path(space_key) / filename)
|
|
186
199
|
|
|
187
200
|
source_identifiers = SourceIdentifiers(
|
|
188
201
|
filename=filename,
|
|
@@ -201,7 +214,9 @@ class ConfluenceIndexer(Indexer):
|
|
|
201
214
|
|
|
202
215
|
|
|
203
216
|
class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin):
|
|
204
|
-
|
|
217
|
+
max_num_metadata_permissions: int = Field(
|
|
218
|
+
250, description="Approximate maximum number of permissions included in metadata"
|
|
219
|
+
)
|
|
205
220
|
|
|
206
221
|
|
|
207
222
|
@dataclass
|
|
@@ -209,6 +224,8 @@ class ConfluenceDownloader(Downloader):
|
|
|
209
224
|
connection_config: ConfluenceConnectionConfig
|
|
210
225
|
download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
|
|
211
226
|
connector_type: str = CONNECTOR_TYPE
|
|
227
|
+
_permissions_cache: dict = field(default_factory=OrderedDict)
|
|
228
|
+
_permissions_cache_max_size: int = 5
|
|
212
229
|
|
|
213
230
|
def download_embedded_files(
|
|
214
231
|
self, session, html: str, current_file_data: FileData
|
|
@@ -233,6 +250,145 @@ class ConfluenceDownloader(Downloader):
|
|
|
233
250
|
session=session,
|
|
234
251
|
)
|
|
235
252
|
|
|
253
|
+
def parse_permissions(self, doc_permissions: dict, space_permissions: list) -> dict[str, dict]:
    """
    Parses document and space permissions to determine final user/group roles.

    Page-level restrictions, when present, override space-level permissions for the
    same operation. Otherwise the relevant space permissions (read, administer,
    delete) are applied.

    :param doc_permissions: dict containing document-level restrictions, keyed by
        operation ("read", "update")
        - doc_permissions type in Confluence: ContentRestrictionArray
    :param space_permissions: list of space-level permission assignments
        - space_permissions type in Confluence: list of SpacePermissionAssignment
    :return: dict keyed by "read", "update" and "delete"; each value is a dict with
        sorted "users" and "groups" lists of principal ids
    """
    # Only these principal kinds have a bucket ("users"/"groups") in the result.
    principal_kinds = ("user", "group")

    def restricted_entities(action: str, entity_type: str) -> list:
        # Missing keys and explicit nulls at any level are treated as "no restriction".
        restrictions = (doc_permissions.get(action) or {}).get("restrictions") or {}
        return (restrictions.get(entity_type) or {}).get("results") or []

    # Separate flags to track if view or edit is restricted at the page level
    page_view_restricted = any(restricted_entities("read", kind) for kind in principal_kinds)
    page_edit_restricted = any(restricted_entities("update", kind) for kind in principal_kinds)

    permissions_by_role = {
        "read": {"users": set(), "groups": set()},
        "update": {"users": set(), "groups": set()},
        "delete": {"users": set(), "groups": set()},
    }

    total_permissions = 0

    for action in doc_permissions:
        if action not in permissions_by_role:
            # Non-operation keys in the response carry no principals.
            continue
        # Iterating the known kinds skips auxiliary entries such as "_expandable".
        for entity_type in principal_kinds:
            bucket = f"{entity_type}s"
            for entity in restricted_entities(action, entity_type):
                entity_id = entity["accountId"] if entity_type == "user" else entity["id"]
                permissions_by_role[action][bucket].add(entity_id)
                total_permissions += 1
                # edit permission implies view permission
                if action == "update":
                    # Not counted a second time, so the final total may exceed
                    # max_num_metadata_permissions.
                    permissions_by_role["read"][bucket].add(entity_id)

    max_permissions = self.download_config.max_num_metadata_permissions
    for space_perm in space_permissions:
        if total_permissions >= max_permissions:
            # The counter never decreases, so nothing further can be added.
            break

        space_operation = space_perm["operation"]["key"]
        space_target_type = space_perm["operation"]["targetType"]
        space_entity_id = space_perm["principal"]["id"]
        space_entity_type = space_perm["principal"]["type"]

        if space_entity_type not in principal_kinds:
            # e.g. role-based principals: there is no bucket to record them in.
            continue
        bucket = f"{space_entity_type}s"

        # Apply space-level view permissions if no page restrictions exist
        if space_target_type == "space" and space_operation == "read" and not page_view_restricted:
            permissions_by_role["read"][bucket].add(space_entity_id)
            total_permissions += 1

        # Administer permission includes view + edit. Apply if not page restricted
        elif space_target_type == "space" and space_operation == "administer":
            if not page_view_restricted:
                permissions_by_role["read"][bucket].add(space_entity_id)
                total_permissions += 1
            if not page_edit_restricted:
                # Not counted a second time, so the final total may exceed
                # max_num_metadata_permissions.
                permissions_by_role["update"][bucket].add(space_entity_id)

        # Add the "delete page" space permission only for principals that can read the page
        elif (
            space_target_type == "page"
            and space_operation == "delete"
            and space_entity_id in permissions_by_role["read"][bucket]
        ):
            permissions_by_role["delete"][bucket].add(space_entity_id)
            total_permissions += 1

    # turn sets into sorted lists for consistency and json serialization
    for role_dict in permissions_by_role.values():
        for key in role_dict:
            role_dict[key] = sorted(role_dict[key])

    return permissions_by_role
|
|
351
|
+
|
|
352
|
+
def _get_permissions_for_space(self, space_id: int) -> Optional[List[dict]]:
|
|
353
|
+
if space_id in self._permissions_cache:
|
|
354
|
+
self._permissions_cache.move_to_end(space_id) # mark recent use
|
|
355
|
+
return self._permissions_cache[space_id]
|
|
356
|
+
else:
|
|
357
|
+
with self.connection_config.get_client() as client:
|
|
358
|
+
try:
|
|
359
|
+
# TODO limit the total number of results being called.
|
|
360
|
+
# not yet implemented because this client call doesn't allow for filtering for
|
|
361
|
+
# certain operations, so adding a limit here would result in too little data.
|
|
362
|
+
space_permissions = []
|
|
363
|
+
space_permissions_result = client.get(f"/api/v2/spaces/{space_id}/permissions")
|
|
364
|
+
space_permissions.extend(space_permissions_result["results"])
|
|
365
|
+
if space_permissions_result["_links"].get("next"): # pagination
|
|
366
|
+
while space_permissions_result.get("next"):
|
|
367
|
+
space_permissions_result = client.get(space_permissions_result["next"])
|
|
368
|
+
space_permissions.extend(space_permissions_result["results"])
|
|
369
|
+
|
|
370
|
+
if len(self._permissions_cache) >= self._permissions_cache_max_size:
|
|
371
|
+
self._permissions_cache.popitem(last=False) # LRU/FIFO eviction
|
|
372
|
+
self._permissions_cache[space_id] = space_permissions
|
|
373
|
+
|
|
374
|
+
return space_permissions
|
|
375
|
+
except Exception as e:
|
|
376
|
+
logger.debug(f"Could not retrieve permissions for space {space_id}: {e}")
|
|
377
|
+
return None
|
|
378
|
+
|
|
379
|
+
def _parse_permissions_for_doc(self, doc_id: str, space_permissions: list) -> Optional[dict]:
|
|
380
|
+
with self.connection_config.get_client() as client:
|
|
381
|
+
try:
|
|
382
|
+
doc_permissions = client.get_all_restrictions_for_content(content_id=doc_id)
|
|
383
|
+
parsed_permissions_dict = self.parse_permissions(doc_permissions, space_permissions)
|
|
384
|
+
|
|
385
|
+
except Exception as e:
|
|
386
|
+
# skip writing any permission metadata
|
|
387
|
+
logger.debug(f"Could not retrieve permissions for doc {doc_id}: {e}")
|
|
388
|
+
return None
|
|
389
|
+
|
|
390
|
+
return parsed_permissions_dict
|
|
391
|
+
|
|
236
392
|
def run(self, file_data: FileData, **kwargs) -> download_responses:
|
|
237
393
|
from bs4 import BeautifulSoup
|
|
238
394
|
|
|
@@ -268,6 +424,14 @@ class ConfluenceDownloader(Downloader):
|
|
|
268
424
|
soup = BeautifulSoup(content, "html.parser")
|
|
269
425
|
f.write(soup.prettify())
|
|
270
426
|
|
|
427
|
+
# Get document permissions and update metadata
|
|
428
|
+
space_id = file_data.additional_metadata["space_id"]
|
|
429
|
+
space_perm = self._get_permissions_for_space(space_id) # must be the id, NOT the space key
|
|
430
|
+
if space_perm:
|
|
431
|
+
combined_doc_permissions = self._parse_permissions_for_doc(doc_id, space_perm)
|
|
432
|
+
if combined_doc_permissions:
|
|
433
|
+
file_data.metadata.permissions_data = combined_doc_permissions
|
|
434
|
+
|
|
271
435
|
# Update file_data with metadata
|
|
272
436
|
file_data.metadata.date_created = page["history"]["createdDate"]
|
|
273
437
|
file_data.metadata.date_modified = page["version"]["when"]
|
|
@@ -391,6 +391,7 @@ class GoogleDriveIndexer(Indexer):
|
|
|
391
391
|
) -> list[FileData]:
|
|
392
392
|
root_info = self.get_root_info(files_client=files_client, object_id=object_id)
|
|
393
393
|
if not self.is_dir(root_info):
|
|
394
|
+
root_info["permissions"] = self.extract_permissions(root_info.get("permissions"))
|
|
394
395
|
data = [self.map_file_data(root_info)]
|
|
395
396
|
else:
|
|
396
397
|
file_contents = self.get_paginated_results(
|
|
@@ -400,11 +401,49 @@ class GoogleDriveIndexer(Indexer):
|
|
|
400
401
|
recursive=recursive,
|
|
401
402
|
previous_path=root_info["name"],
|
|
402
403
|
)
|
|
403
|
-
data = [
|
|
404
|
+
data = []
|
|
405
|
+
for f in file_contents:
|
|
406
|
+
f["permissions"] = self.extract_permissions(f.get("permissions"))
|
|
407
|
+
data.append(self.map_file_data(f=f))
|
|
404
408
|
for d in data:
|
|
405
409
|
d.metadata.record_locator["drive_id"]: object_id
|
|
406
410
|
return data
|
|
407
411
|
|
|
412
|
+
def extract_permissions(self, permissions: list[dict]) -> dict:
    """
    Normalize Google Drive permission entries into read/update/delete principals.

    :param permissions: permission entries of a file; each entry is expected to carry
        "type", "role" and "id". May be None or empty.
    :return: an empty dict when there are no permissions; otherwise a dict keyed by
        "read", "update" and "delete", each mapping to a dict with sorted "users"
        and "groups" lists of permission ids
    """
    if not permissions:
        return {}

    # https://developers.google.com/workspace/drive/api/guides/ref-roles
    role_mapping = {
        "owner": ["read", "update", "delete"],
        "organizer": ["read", "update", "delete"],
        "fileOrganizer": ["read", "update"],
        "writer": ["read", "update"],
        "commenter": ["read"],
        "reader": ["read"],
    }

    normalized_permissions = {
        "read": {"users": set(), "groups": set()},
        "update": {"users": set(), "groups": set()},
        "delete": {"users": set(), "groups": set()},
    }

    for item in permissions:
        # https://developers.google.com/workspace/drive/api/reference/rest/v3/permissions
        # ignore permissions for "anyone" and "domain"
        principal_type = item.get("type")
        if principal_type not in ("user", "group"):
            continue
        type_key = f"{principal_type}s"
        # An unrecognised role grants nothing here instead of aborting the whole listing.
        for operation in role_mapping.get(item.get("role"), ()):
            normalized_permissions[operation][type_key].add(item["id"])

    # turn sets into sorted lists for consistency and json serialization
    return {
        operation: {kind: sorted(ids) for kind, ids in principals.items()}
        for operation, principals in normalized_permissions.items()
    }
|
|
446
|
+
|
|
408
447
|
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
409
448
|
with self.connection_config.get_client() as client:
|
|
410
449
|
for f in self.get_files(
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.0.1" # pragma: no cover
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/base/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/base/dest.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/base/importer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/utils/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/cli/utils/click.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/data_types/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/azure_openai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/bedrock.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/huggingface.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/interfaces.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/mixedbreadai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/togetherai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/vertexai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/embed/voyageai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/connector.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/downloader.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/indexer.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/process.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/processor.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.1 → unstructured_ingest-1.0.4}/unstructured_ingest/interfaces/uploader.py
RENAMED
|
File without changes
|
|
File without changes
|