unstructured-ingest 1.0.35__tar.gz → 1.0.40__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/PKG-INFO +1 -1
- unstructured_ingest-1.0.40/unstructured_ingest/__version__.py +1 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/confluence.py +20 -1
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +3 -4
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/milvus.py +81 -7
- unstructured_ingest-1.0.40/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/pinecone.py +1 -1
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sharepoint.py +32 -4
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/html.py +14 -1
- unstructured_ingest-1.0.35/unstructured_ingest/__version__.py +0 -1
- unstructured_ingest-1.0.35/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +0 -57
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/.gitignore +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/LICENSE.md +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/README.md +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/pyproject.toml +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/README.md +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/cmd.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/dest.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/importer.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/src.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/cli.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/cmds.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/utils/click.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/data_types/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/data_types/entities.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/data_types/file_data.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/azure_openai.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/bedrock.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/huggingface.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/interfaces.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/mixedbreadai.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/octoai.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/openai.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/togetherai.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/vertexai.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/voyageai.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/error.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/errors_v2.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/connector.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/downloader.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/indexer.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/process.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/processor.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/upload_stager.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/uploader.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/logger.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/main.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/otel.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/interfaces.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/otel.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/pipeline.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/download.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/embed.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/filter.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/index.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/partition.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/stage.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/steps/upload.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/chunker.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connector_registry.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/airtable.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/astradb.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/chroma.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/volumes.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/delta_table.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/discord.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/s3.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/github.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/gitlab.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/google_drive.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/jira.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/local.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/mongodb.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/connector.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/onedrive.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/outlook.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/redisdb.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/salesforce.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/slack.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/sql.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/utils.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/vectara.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/embedder.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/filter.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/partitioner.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/uncompress.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/unstructured_api.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/chunking.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/compression.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/constants.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/data_prep.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/dep_check.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/ndjson.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/pydantic_models.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
- {unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/utils/table.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.40" # pragma: no cover
|
|
@@ -33,6 +33,8 @@ from unstructured_ingest.utils.string_and_date_utils import fix_unescaped_unicod
|
|
|
33
33
|
|
|
34
34
|
if TYPE_CHECKING:
|
|
35
35
|
from atlassian import Confluence
|
|
36
|
+
from bs4 import BeautifulSoup
|
|
37
|
+
from bs4.element import Tag
|
|
36
38
|
|
|
37
39
|
CONNECTOR_TYPE = "confluence"
|
|
38
40
|
|
|
@@ -235,11 +237,28 @@ class ConfluenceIndexer(Indexer):
|
|
|
235
237
|
yield file_data
|
|
236
238
|
|
|
237
239
|
|
|
238
|
-
class ConfluenceDownloaderConfig(
|
|
240
|
+
class ConfluenceDownloaderConfig(HtmlMixin, DownloaderConfig):
|
|
239
241
|
max_num_metadata_permissions: int = Field(
|
|
240
242
|
250, description="Approximate maximum number of permissions included in metadata"
|
|
241
243
|
)
|
|
242
244
|
|
|
245
|
+
@requires_dependencies(["bs4"])
|
|
246
|
+
def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
|
|
247
|
+
from bs4.element import Tag
|
|
248
|
+
|
|
249
|
+
return [
|
|
250
|
+
element
|
|
251
|
+
for element in html_soup.find_all(
|
|
252
|
+
"a",
|
|
253
|
+
attrs={
|
|
254
|
+
"class": "confluence-embedded-file",
|
|
255
|
+
"data-linked-resource-type": "attachment",
|
|
256
|
+
"href": True,
|
|
257
|
+
},
|
|
258
|
+
)
|
|
259
|
+
if isinstance(element, Tag)
|
|
260
|
+
]
|
|
261
|
+
|
|
243
262
|
|
|
244
263
|
@dataclass
|
|
245
264
|
class ConfluenceDownloader(Downloader):
|
|
@@ -343,10 +343,9 @@ class FsspecUploader(Uploader):
|
|
|
343
343
|
raise self.wrap_error(e=e)
|
|
344
344
|
|
|
345
345
|
def get_upload_path(self, file_data: FileData) -> Path:
|
|
346
|
-
upload_path = (
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
)
|
|
346
|
+
upload_path = Path(
|
|
347
|
+
self.upload_config.path_without_protocol
|
|
348
|
+
) / file_data.source_identifiers.relative_path.lstrip("/")
|
|
350
349
|
updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
|
|
351
350
|
return updated_upload_path
|
|
352
351
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from contextlib import contextmanager
|
|
3
3
|
from dataclasses import dataclass, field
|
|
4
|
-
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
5
5
|
|
|
6
6
|
from dateutil import parser
|
|
7
7
|
from pydantic import Field, Secret
|
|
@@ -97,10 +97,16 @@ class MilvusUploadStager(UploadStager):
|
|
|
97
97
|
|
|
98
98
|
def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
|
|
99
99
|
working_data = element_dict.copy()
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
100
|
+
|
|
101
|
+
if self.upload_stager_config.flatten_metadata:
|
|
102
|
+
metadata: dict[str, Any] = working_data.pop("metadata", {})
|
|
103
|
+
flattened_metadata = flatten_dict(
|
|
104
|
+
metadata,
|
|
105
|
+
separator="_",
|
|
106
|
+
flatten_lists=False,
|
|
107
|
+
remove_none=True,
|
|
108
|
+
)
|
|
109
|
+
working_data.update(flattened_metadata)
|
|
104
110
|
|
|
105
111
|
# TODO: milvus sdk doesn't seem to support defaults via the schema yet,
|
|
106
112
|
# remove once that gets updated
|
|
@@ -154,6 +160,23 @@ class MilvusUploader(Uploader):
|
|
|
154
160
|
upload_config: MilvusUploaderConfig
|
|
155
161
|
connector_type: str = CONNECTOR_TYPE
|
|
156
162
|
|
|
163
|
+
def has_dynamic_fields_enabled(self) -> bool:
    """Report whether the target collection has dynamic fields enabled.

    The collection description is read through a client from
    ``get_client()``. Either the ``enable_dynamic_field`` key or, when that
    key is absent, the ``enableDynamicField`` key decides the answer.

    Returns:
        The truthiness of the flag found, or ``False`` when neither key is
        present or when the lookup raises (a warning is logged in that case).
    """
    try:
        with self.get_client() as milvus_client:
            description = milvus_client.describe_collection(
                self.upload_config.collection_name
            )
            # The camelCase spelling is only a fallback: the snake_case key
            # wins whenever it is present in the description.
            camel_case_flag = description.get("enableDynamicField", False)
            dynamic_flag = description.get("enable_dynamic_field", camel_case_flag)
            return bool(dynamic_flag)
    except Exception as e:
        # Best-effort check: any failure is treated as "not enabled".
        logger.warning(f"Could not determine if collection has dynamic fields enabled: {e}")
        return False
|
|
179
|
+
|
|
157
180
|
@DestinationConnectionError.wrap
|
|
158
181
|
def precheck(self):
|
|
159
182
|
from pymilvus import MilvusException
|
|
@@ -164,6 +187,7 @@ class MilvusUploader(Uploader):
|
|
|
164
187
|
raise DestinationConnectionError(
|
|
165
188
|
f"Collection '{self.upload_config.collection_name}' does not exist"
|
|
166
189
|
)
|
|
190
|
+
|
|
167
191
|
except MilvusException as milvus_exception:
|
|
168
192
|
raise DestinationConnectionError(
|
|
169
193
|
f"failed to precheck Milvus: {str(milvus_exception.message)}"
|
|
@@ -193,16 +217,66 @@ class MilvusUploader(Uploader):
|
|
|
193
217
|
)
|
|
194
218
|
|
|
195
219
|
@requires_dependencies(["pymilvus"], extras="milvus")
|
|
196
|
-
def
|
|
220
|
+
def _prepare_data_for_insert(self, data: list[dict]) -> list[dict]:
|
|
221
|
+
"""
|
|
222
|
+
Conforms the provided data to the schema of the target Milvus collection.
|
|
223
|
+
- If dynamic fields are enabled, it ensures JSON-stringified fields are decoded.
|
|
224
|
+
- If dynamic fields are disabled, it filters out any fields not present in the schema.
|
|
225
|
+
"""
|
|
226
|
+
|
|
227
|
+
dynamic_fields_enabled = self.has_dynamic_fields_enabled()
|
|
228
|
+
|
|
229
|
+
# If dynamic fields are enabled, 'languages' field needs to be a list
|
|
230
|
+
if dynamic_fields_enabled:
|
|
231
|
+
logger.debug("Dynamic fields enabled, ensuring 'languages' field is a list.")
|
|
232
|
+
prepared_data = []
|
|
233
|
+
for item in data:
|
|
234
|
+
new_item = item.copy()
|
|
235
|
+
if "languages" in new_item and isinstance(new_item["languages"], str):
|
|
236
|
+
try:
|
|
237
|
+
new_item["languages"] = json.loads(new_item["languages"])
|
|
238
|
+
except (json.JSONDecodeError, TypeError):
|
|
239
|
+
logger.warning(
|
|
240
|
+
f"Could not JSON decode languages field: {new_item['languages']}. "
|
|
241
|
+
"Leaving as string.",
|
|
242
|
+
)
|
|
243
|
+
prepared_data.append(new_item)
|
|
244
|
+
return prepared_data
|
|
245
|
+
|
|
246
|
+
# If dynamic fields are not enabled, we need to filter out the metadata fields
|
|
247
|
+
# to avoid insertion errors for fields not defined in the schema
|
|
248
|
+
with self.get_client() as client:
|
|
249
|
+
collection_info = client.describe_collection(
|
|
250
|
+
self.upload_config.collection_name,
|
|
251
|
+
)
|
|
252
|
+
schema_fields = {
|
|
253
|
+
field["name"]
|
|
254
|
+
for field in collection_info.get("fields", [])
|
|
255
|
+
if not field.get("auto_id", False)
|
|
256
|
+
}
|
|
257
|
+
# Remove metadata fields that are not part of the base schema
|
|
258
|
+
filtered_data = []
|
|
259
|
+
for item in data:
|
|
260
|
+
filtered_item = {key: value for key, value in item.items() if key in schema_fields}
|
|
261
|
+
filtered_data.append(filtered_item)
|
|
262
|
+
return filtered_data
|
|
263
|
+
|
|
264
|
+
@requires_dependencies(["pymilvus"], extras="milvus")
|
|
265
|
+
def insert_results(self, data: list[dict]):
|
|
197
266
|
from pymilvus import MilvusException
|
|
198
267
|
|
|
199
268
|
logger.info(
|
|
200
269
|
f"uploading {len(data)} entries to {self.connection_config.db_name} "
|
|
201
270
|
f"db in collection {self.upload_config.collection_name}"
|
|
202
271
|
)
|
|
272
|
+
|
|
273
|
+
prepared_data = self._prepare_data_for_insert(data=data)
|
|
274
|
+
|
|
203
275
|
with self.get_client() as client:
|
|
204
276
|
try:
|
|
205
|
-
res = client.insert(
|
|
277
|
+
res = client.insert(
|
|
278
|
+
collection_name=self.upload_config.collection_name, data=prepared_data
|
|
279
|
+
)
|
|
206
280
|
except MilvusException as milvus_exception:
|
|
207
281
|
raise WriteError(
|
|
208
282
|
f"failed to upload records to Milvus: {str(milvus_exception.message)}"
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# https://developers.notion.com/reference/block#synced-block
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import List, Optional
|
|
4
|
+
|
|
5
|
+
from htmlBuilder.tags import HtmlTag
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class OriginalSyncedBlock(BlockBase):
    """Synced block that carries its own ``children`` content."""

    # Not populated by from_dict, which only passes ``children``.
    synced_from: Optional[str] = None
    children: List[dict] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Original synced blocks can have children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build an OriginalSyncedBlock from its dictionary form.

        Raises:
            ValueError: if ``data`` has no ``children`` key.
        """
        if "children" in data:
            return cls(children=data["children"])
        raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")

    def get_html(self) -> Optional[HtmlTag]:
        """No HTML is produced for the block itself."""
        return None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class DuplicateSyncedBlock(BlockBase):
    """Synced block that points at another block through ``synced_from``."""

    # Both values are copied from the nested ``synced_from`` dictionary.
    type: str
    block_id: str

    @staticmethod
    def can_have_children() -> bool:
        """Duplicate synced blocks are reported as able to have children.

        The duplicate does not carry children itself, but it stands in for
        content that does, so the Notion API might report has_children=True
        on the parent block object. The actual children are fetched from the
        original block.
        """
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build a DuplicateSyncedBlock from its dictionary form.

        ``data`` must contain a non-empty ``synced_from`` dictionary holding
        both ``type`` and ``block_id``.

        Raises:
            ValueError: if ``synced_from`` is missing, empty or not a dict,
                or if it lacks ``type`` or ``block_id``.
        """
        synced_from_data = data.get("synced_from")
        if not (synced_from_data and isinstance(synced_from_data, dict)):
            raise ValueError(f"Invalid data structure for DuplicateSyncedBlock: {data}")
        # Both keys of the nested reference are required.
        if not ("type" in synced_from_data and "block_id" in synced_from_data):
            raise ValueError(
                f"Missing 'type' or 'block_id' in synced_from data: {synced_from_data}"
            )
        return cls(
            type=synced_from_data["type"],
            block_id=synced_from_data["block_id"],
        )

    def get_html(self) -> Optional[HtmlTag]:
        """No HTML is produced for the duplicate itself.

        Rendering it might need the original block's content, which this
        simple data class does not fetch.
        """
        return None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class SyncBlock(BlockBase):
    """Entry point that dispatches synced-block data to the matching class."""

    @staticmethod
    def can_have_children() -> bool:
        """Synced blocks, original or duplicate, can conceptually have children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Return the synced-block object that matches ``data``.

        A non-None ``synced_from`` yields a DuplicateSyncedBlock; otherwise a
        ``children`` key yields an OriginalSyncedBlock built from ``data``.
        """
        if data.get("synced_from") is not None:
            # Duplicate: the data holds a reference to another block.
            return DuplicateSyncedBlock.from_dict(data)
        if "children" in data:
            # Original: the data holds the children directly.
            return OriginalSyncedBlock.from_dict(data)
        # Neither 'synced_from' nor 'children' is present. The Notion API
        # might return this for an empty original synced block, so it is
        # treated as an OriginalSyncedBlock without children. If that
        # assumption is wrong, errors might occur later; consider logging a
        # warning here if strictness is needed.
        return OriginalSyncedBlock(children=[])

    def get_html(self) -> Optional[HtmlTag]:
        """No HTML is produced by the dispatcher class.

        from_dict returns an Original or Duplicate instance, each with its
        own get_html, so this method might not be directly called.
        """
        return None
|
|
@@ -240,7 +240,7 @@ class PineconeUploader(VectorDBUploader):
|
|
|
240
240
|
destination_name: str = "unstructuredautocreated",
|
|
241
241
|
destination_type: Literal["pod", "serverless"] = "serverless",
|
|
242
242
|
serverless_cloud: str = "aws",
|
|
243
|
-
serverless_region: str = "us-
|
|
243
|
+
serverless_region: str = "us-east-1",
|
|
244
244
|
pod_environment: str = "us-east1-gcp",
|
|
245
245
|
pod_type: str = "p1.x1",
|
|
246
246
|
pod_count: int = 1,
|
|
@@ -29,6 +29,7 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
29
29
|
|
|
30
30
|
if TYPE_CHECKING:
|
|
31
31
|
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
32
|
+
from office365.onedrive.sites.site import Site
|
|
32
33
|
|
|
33
34
|
CONNECTOR_TYPE = "sharepoint"
|
|
34
35
|
LEGACY_DEFAULT_PATH = "Shared Documents"
|
|
@@ -51,6 +52,33 @@ class SharepointConnectionConfig(OnedriveConnectionConfig):
|
|
|
51
52
|
https://[tenant]-admin.sharepoint.com.\
|
|
52
53
|
This requires the app to be registered at a tenant level"
|
|
53
54
|
)
|
|
55
|
+
library: Optional[str] = Field(
|
|
56
|
+
default=None,
|
|
57
|
+
description="Sharepoint library name. If not provided, the default \
|
|
58
|
+
drive will be used.",
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
def _get_drive_item(self, client_site: Site) -> DriveItem:
|
|
62
|
+
"""Helper method to get the drive item for the specified library or default drive."""
|
|
63
|
+
site_drive_item = None
|
|
64
|
+
if self.library:
|
|
65
|
+
for drive in client_site.drives.get().execute_query():
|
|
66
|
+
if drive.name == self.library:
|
|
67
|
+
logger.info(f"Found the requested library: {self.library}")
|
|
68
|
+
site_drive_item = drive.get().execute_query().root
|
|
69
|
+
break
|
|
70
|
+
|
|
71
|
+
# If no specific library was found or requested, use the default drive
|
|
72
|
+
if not site_drive_item:
|
|
73
|
+
if self.library:
|
|
74
|
+
logger.warning(
|
|
75
|
+
f"Library '{self.library}' not found in site '{self.site}'. "
|
|
76
|
+
"Using the default drive instead."
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
site_drive_item = client_site.drive.get().execute_query().root
|
|
80
|
+
|
|
81
|
+
return site_drive_item
|
|
54
82
|
|
|
55
83
|
|
|
56
84
|
class SharepointIndexerConfig(OnedriveIndexerConfig):
|
|
@@ -76,8 +104,8 @@ class SharepointIndexer(OnedriveIndexer):
|
|
|
76
104
|
|
|
77
105
|
client = await asyncio.to_thread(self.connection_config.get_client)
|
|
78
106
|
try:
|
|
79
|
-
|
|
80
|
-
site_drive_item =
|
|
107
|
+
client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
|
|
108
|
+
site_drive_item = self.connection_config._get_drive_item(client_site)
|
|
81
109
|
except ClientRequestException:
|
|
82
110
|
logger.info("Site not found")
|
|
83
111
|
|
|
@@ -118,8 +146,8 @@ class SharepointDownloader(OnedriveDownloader):
|
|
|
118
146
|
client = self.connection_config.get_client()
|
|
119
147
|
|
|
120
148
|
try:
|
|
121
|
-
|
|
122
|
-
site_drive_item =
|
|
149
|
+
client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
|
|
150
|
+
site_drive_item = self.connection_config._get_drive_item(client_site)
|
|
123
151
|
except ClientRequestException:
|
|
124
152
|
logger.info("Site not found")
|
|
125
153
|
file = site_drive_item.get_by_path(server_relative_path).get().execute_query()
|
|
@@ -12,6 +12,7 @@ from unstructured_ingest.logger import logger
|
|
|
12
12
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
13
|
|
|
14
14
|
if TYPE_CHECKING:
|
|
15
|
+
from bs4 import BeautifulSoup
|
|
15
16
|
from bs4.element import Tag
|
|
16
17
|
from requests import Session
|
|
17
18
|
|
|
@@ -96,7 +97,7 @@ class HtmlMixin(BaseModel):
|
|
|
96
97
|
from bs4 import BeautifulSoup
|
|
97
98
|
|
|
98
99
|
soup = BeautifulSoup(html, "html.parser")
|
|
99
|
-
tags =
|
|
100
|
+
tags = self._find_hyperlink_tags(soup)
|
|
100
101
|
hrefs = [
|
|
101
102
|
tag["href"]
|
|
102
103
|
for tag in tags
|
|
@@ -158,3 +159,15 @@ class HtmlMixin(BaseModel):
|
|
|
158
159
|
)
|
|
159
160
|
for url_to_download in urls_to_download
|
|
160
161
|
]
|
|
162
|
+
|
|
163
|
+
@requires_dependencies(["bs4"])
|
|
164
|
+
def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
    """Return the ``<a>`` tags of the parsed HTML that carry an ``href``.

    Results of ``find_all`` that are not ``Tag`` instances are dropped.
    Override this method to customize the tag search.
    """
    from bs4.element import Tag

    anchors_with_href = html_soup.find_all("a", href=True)
    return list(filter(lambda candidate: isinstance(candidate, Tag), anchors_with_href))
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.0.35" # pragma: no cover
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
# https://developers.notion.com/reference/block#synced-block
|
|
2
|
-
from dataclasses import dataclass, field
|
|
3
|
-
from typing import List, Optional
|
|
4
|
-
|
|
5
|
-
from htmlBuilder.tags import HtmlTag
|
|
6
|
-
|
|
7
|
-
from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class OriginalSyncedBlock(BlockBase):
|
|
12
|
-
synced_from: Optional[str] = None
|
|
13
|
-
children: List[dict] = field(default_factory=list)
|
|
14
|
-
|
|
15
|
-
@staticmethod
|
|
16
|
-
def can_have_children() -> bool:
|
|
17
|
-
return True
|
|
18
|
-
|
|
19
|
-
@classmethod
|
|
20
|
-
def from_dict(cls, data: dict):
|
|
21
|
-
return cls(children=data["children"])
|
|
22
|
-
|
|
23
|
-
def get_html(self) -> Optional[HtmlTag]:
|
|
24
|
-
return None
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
@dataclass
|
|
28
|
-
class DuplicateSyncedBlock(BlockBase):
|
|
29
|
-
type: str
|
|
30
|
-
block_id: str
|
|
31
|
-
|
|
32
|
-
@staticmethod
|
|
33
|
-
def can_have_children() -> bool:
|
|
34
|
-
return True
|
|
35
|
-
|
|
36
|
-
@classmethod
|
|
37
|
-
def from_dict(cls, data: dict):
|
|
38
|
-
return cls(**data)
|
|
39
|
-
|
|
40
|
-
def get_html(self) -> Optional[HtmlTag]:
|
|
41
|
-
return None
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
class SyncBlock(BlockBase):
|
|
45
|
-
@staticmethod
|
|
46
|
-
def can_have_children() -> bool:
|
|
47
|
-
return True
|
|
48
|
-
|
|
49
|
-
@classmethod
|
|
50
|
-
def from_dict(cls, data: dict):
|
|
51
|
-
if "synced_from" in data:
|
|
52
|
-
return OriginalSyncedBlock.from_dict(data)
|
|
53
|
-
else:
|
|
54
|
-
return DuplicateSyncedBlock.from_dict(data)
|
|
55
|
-
|
|
56
|
-
def get_html(self) -> Optional[HtmlTag]:
|
|
57
|
-
return None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/cmd.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/dest.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/importer.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/base/src.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/utils/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/cli/utils/click.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/data_types/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/data_types/entities.py
RENAMED
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/azure_openai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/bedrock.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/huggingface.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/interfaces.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/mixedbreadai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/octoai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/openai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/togetherai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/vertexai.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/embed/voyageai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/indexer.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/process.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/interfaces/uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/interfaces.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/otel.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/pipeline/pipeline.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/__init__.py
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.35 → unstructured_ingest-1.0.40}/unstructured_ingest/processes/chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|