unstructured-ingest 1.0.37__tar.gz → 1.0.41__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/PKG-INFO +1 -1
- unstructured_ingest-1.0.41/unstructured_ingest/__version__.py +1 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/airtable.py +1 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/astradb.py +5 -2
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/confluence.py +21 -1
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/databricks/volumes.py +7 -5
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/discord.py +1 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +9 -2
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/gitlab.py +7 -6
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/jira.py +1 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/local.py +11 -11
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/milvus.py +81 -7
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/mongodb.py +5 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/connector.py +2 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +8 -9
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/onedrive.py +1 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/outlook.py +3 -2
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/pinecone.py +1 -1
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/salesforce.py +6 -4
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/slack.py +5 -3
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/sql/sql.py +6 -1
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +10 -6
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/utils/html.py +14 -1
- unstructured_ingest-1.0.37/unstructured_ingest/__version__.py +0 -1
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/.gitignore +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/LICENSE.md +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/README.md +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/pyproject.toml +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/cli/README.md +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/cli/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/cli/base/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/cli/base/cmd.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/cli/base/dest.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/cli/base/importer.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/cli/base/src.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/cli/cli.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/cli/cmds.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/cli/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/cli/utils/click.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/data_types/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/data_types/entities.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/data_types/file_data.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/embed/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/embed/azure_openai.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/embed/bedrock.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/embed/huggingface.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/embed/interfaces.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/embed/mixedbreadai.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/embed/octoai.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/embed/openai.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/embed/togetherai.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/embed/vertexai.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/embed/voyageai.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/error.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/errors_v2.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/interfaces/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/interfaces/connector.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/interfaces/downloader.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/interfaces/indexer.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/interfaces/process.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/interfaces/processor.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/interfaces/upload_stager.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/interfaces/uploader.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/logger.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/main.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/otel.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/interfaces.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/otel.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/pipeline.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/steps/download.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/steps/embed.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/steps/filter.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/steps/index.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/steps/partition.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/steps/stage.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/pipeline/steps/upload.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/chunker.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connector_registry.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/chroma.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/delta_table.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/fsspec/s3.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/github.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/google_drive.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/redisdb.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/sharepoint.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/utils.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/vectara.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/embedder.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/filter.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/partitioner.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/uncompress.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/unstructured_api.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/utils/chunking.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/utils/compression.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/utils/constants.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/utils/data_prep.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/utils/dep_check.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/utils/ndjson.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/utils/pydantic_models.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
- {unstructured_ingest-1.0.37 → unstructured_ingest-1.0.41}/unstructured_ingest/utils/table.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.41" # pragma: no cover
|
|
@@ -195,8 +195,10 @@ class AstraDBIndexer(Indexer):
|
|
|
195
195
|
all_ids = self._get_doc_ids()
|
|
196
196
|
ids = list(all_ids)
|
|
197
197
|
id_batches = batch_generator(ids, self.index_config.batch_size)
|
|
198
|
-
|
|
199
198
|
for batch in id_batches:
|
|
199
|
+
batch_items = [BatchItem(identifier=b) for b in batch]
|
|
200
|
+
display_name = (f"{self.index_config.collection_name}-{self.index_config.keyspace}"
|
|
201
|
+
f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]")
|
|
200
202
|
fd = AstraDBBatchFileData(
|
|
201
203
|
connector_type=CONNECTOR_TYPE,
|
|
202
204
|
metadata=FileDataSourceMetadata(
|
|
@@ -206,7 +208,8 @@ class AstraDBIndexer(Indexer):
|
|
|
206
208
|
collection_name=self.index_config.collection_name,
|
|
207
209
|
keyspace=self.index_config.keyspace,
|
|
208
210
|
),
|
|
209
|
-
batch_items=[BatchItem(identifier=b) for b in batch],
|
|
211
|
+
batch_items=batch_items,
|
|
212
|
+
display_name=display_name,
|
|
210
213
|
)
|
|
211
214
|
yield fd
|
|
212
215
|
|
|
@@ -33,6 +33,8 @@ from unstructured_ingest.utils.string_and_date_utils import fix_unescaped_unicod
|
|
|
33
33
|
|
|
34
34
|
if TYPE_CHECKING:
|
|
35
35
|
from atlassian import Confluence
|
|
36
|
+
from bs4 import BeautifulSoup
|
|
37
|
+
from bs4.element import Tag
|
|
36
38
|
|
|
37
39
|
CONNECTOR_TYPE = "confluence"
|
|
38
40
|
|
|
@@ -231,15 +233,33 @@ class ConfluenceIndexer(Indexer):
|
|
|
231
233
|
metadata=metadata,
|
|
232
234
|
additional_metadata=additional_metadata,
|
|
233
235
|
source_identifiers=source_identifiers,
|
|
236
|
+
display_name=source_identifiers.fullpath,
|
|
234
237
|
)
|
|
235
238
|
yield file_data
|
|
236
239
|
|
|
237
240
|
|
|
238
|
-
class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin):
|
|
241
|
+
class ConfluenceDownloaderConfig(HtmlMixin, DownloaderConfig):
|
|
239
242
|
max_num_metadata_permissions: int = Field(
|
|
240
243
|
250, description="Approximate maximum number of permissions included in metadata"
|
|
241
244
|
)
|
|
242
245
|
|
|
246
|
+
@requires_dependencies(["bs4"])
|
|
247
|
+
def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
|
|
248
|
+
from bs4.element import Tag
|
|
249
|
+
|
|
250
|
+
return [
|
|
251
|
+
element
|
|
252
|
+
for element in html_soup.find_all(
|
|
253
|
+
"a",
|
|
254
|
+
attrs={
|
|
255
|
+
"class": "confluence-embedded-file",
|
|
256
|
+
"data-linked-resource-type": "attachment",
|
|
257
|
+
"href": True,
|
|
258
|
+
},
|
|
259
|
+
)
|
|
260
|
+
if isinstance(element, Tag)
|
|
261
|
+
]
|
|
262
|
+
|
|
243
263
|
|
|
244
264
|
@dataclass
|
|
245
265
|
class ConfluenceDownloader(Downloader):
|
|
@@ -133,14 +133,15 @@ class DatabricksVolumesIndexer(Indexer, ABC):
|
|
|
133
133
|
if rel_path.startswith("/"):
|
|
134
134
|
rel_path = rel_path[1:]
|
|
135
135
|
filename = Path(file_info.path).name
|
|
136
|
+
source_identifiers = SourceIdentifiers(
|
|
137
|
+
filename=filename,
|
|
138
|
+
rel_path=rel_path,
|
|
139
|
+
fullpath=file_info.path,
|
|
140
|
+
)
|
|
136
141
|
yield FileData(
|
|
137
142
|
identifier=str(uuid5(NAMESPACE_DNS, file_info.path)),
|
|
138
143
|
connector_type=self.connector_type,
|
|
139
|
-
source_identifiers=
|
|
140
|
-
filename=filename,
|
|
141
|
-
rel_path=rel_path,
|
|
142
|
-
fullpath=file_info.path,
|
|
143
|
-
),
|
|
144
|
+
source_identifiers=source_identifiers,
|
|
144
145
|
additional_metadata={
|
|
145
146
|
"catalog": self.index_config.catalog,
|
|
146
147
|
"path": file_info.path,
|
|
@@ -148,6 +149,7 @@ class DatabricksVolumesIndexer(Indexer, ABC):
|
|
|
148
149
|
metadata=FileDataSourceMetadata(
|
|
149
150
|
url=file_info.path, date_modified=str(file_info.modification_time)
|
|
150
151
|
),
|
|
152
|
+
display_name=source_identifiers.fullpath,
|
|
151
153
|
)
|
|
152
154
|
except Exception as e:
|
|
153
155
|
raise self.connection_config.wrap_error(e=e)
|
|
@@ -199,17 +199,24 @@ class ElasticsearchIndexer(Indexer):
|
|
|
199
199
|
all_ids = self._get_doc_ids()
|
|
200
200
|
ids = list(all_ids)
|
|
201
201
|
for batch in batch_generator(ids, self.index_config.batch_size):
|
|
202
|
+
batch_items = [BatchItem(identifier=b) for b in batch]
|
|
203
|
+
url = f"{self.connection_config.hosts[0]}/{self.index_config.index_name}"
|
|
204
|
+
display_name = (
|
|
205
|
+
f"url={url}, batch_size={len(batch_items)} "
|
|
206
|
+
f"ids={batch_items[0].identifier}..{batch_items[-1].identifier}"
|
|
207
|
+
) # noqa: E501
|
|
202
208
|
# Make sure the hash is always a positive number to create identified
|
|
203
209
|
yield ElasticsearchBatchFileData(
|
|
204
210
|
connector_type=CONNECTOR_TYPE,
|
|
205
211
|
metadata=FileDataSourceMetadata(
|
|
206
|
-
url=
|
|
212
|
+
url=url,
|
|
207
213
|
date_processed=str(time()),
|
|
208
214
|
),
|
|
209
215
|
additional_metadata=ElastisearchAdditionalMetadata(
|
|
210
216
|
index_name=self.index_config.index_name,
|
|
211
217
|
),
|
|
212
|
-
batch_items=
|
|
218
|
+
batch_items=batch_items,
|
|
219
|
+
display_name=display_name,
|
|
213
220
|
)
|
|
214
221
|
|
|
215
222
|
|
|
@@ -190,21 +190,22 @@ class GitLabIndexer(Indexer):
|
|
|
190
190
|
"file_path": file["path"],
|
|
191
191
|
"ref": ref,
|
|
192
192
|
}
|
|
193
|
-
|
|
193
|
+
source_identifiers = SourceIdentifiers(
|
|
194
|
+
fullpath=file["path"],
|
|
195
|
+
filename=Path(file["path"]).name,
|
|
196
|
+
rel_path=relative_path,
|
|
197
|
+
)
|
|
194
198
|
yield FileData(
|
|
195
199
|
identifier=file["id"],
|
|
196
200
|
connector_type=CONNECTOR_TYPE,
|
|
197
|
-
source_identifiers=
|
|
198
|
-
fullpath=file["path"],
|
|
199
|
-
filename=Path(file["path"]).name,
|
|
200
|
-
rel_path=relative_path,
|
|
201
|
-
),
|
|
201
|
+
source_identifiers=source_identifiers,
|
|
202
202
|
metadata=FileDataSourceMetadata(
|
|
203
203
|
url=file["id"],
|
|
204
204
|
record_locator=record_locator,
|
|
205
205
|
permissions_data=[{"mode": file["mode"]}],
|
|
206
206
|
),
|
|
207
207
|
additional_metadata={},
|
|
208
|
+
display_name=source_identifiers.fullpath,
|
|
208
209
|
)
|
|
209
210
|
|
|
210
211
|
|
|
@@ -119,21 +119,21 @@ class LocalIndexer(Indexer):
|
|
|
119
119
|
|
|
120
120
|
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
121
121
|
for file_path in self.list_files():
|
|
122
|
+
source_identifiers = SourceIdentifiers(
|
|
123
|
+
fullpath=str(file_path.resolve()),
|
|
124
|
+
filename=file_path.name,
|
|
125
|
+
rel_path=(
|
|
126
|
+
str(file_path.resolve()).replace(str(self.index_config.path.resolve()), "")[1:]
|
|
127
|
+
if not self.index_config.path.is_file()
|
|
128
|
+
else self.index_config.path.name
|
|
129
|
+
),
|
|
130
|
+
)
|
|
122
131
|
file_data = FileData(
|
|
123
132
|
identifier=str(file_path.resolve()),
|
|
124
133
|
connector_type=CONNECTOR_TYPE,
|
|
125
|
-
source_identifiers=
|
|
126
|
-
fullpath=str(file_path.resolve()),
|
|
127
|
-
filename=file_path.name,
|
|
128
|
-
rel_path=(
|
|
129
|
-
str(file_path.resolve()).replace(str(self.index_config.path.resolve()), "")[
|
|
130
|
-
1:
|
|
131
|
-
]
|
|
132
|
-
if not self.index_config.path.is_file()
|
|
133
|
-
else self.index_config.path.name
|
|
134
|
-
),
|
|
135
|
-
),
|
|
134
|
+
source_identifiers=source_identifiers,
|
|
136
135
|
metadata=self.get_file_metadata(path=file_path),
|
|
136
|
+
display_name=source_identifiers.fullpath,
|
|
137
137
|
)
|
|
138
138
|
yield file_data
|
|
139
139
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from contextlib import contextmanager
|
|
3
3
|
from dataclasses import dataclass, field
|
|
4
|
-
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
5
5
|
|
|
6
6
|
from dateutil import parser
|
|
7
7
|
from pydantic import Field, Secret
|
|
@@ -97,10 +97,16 @@ class MilvusUploadStager(UploadStager):
|
|
|
97
97
|
|
|
98
98
|
def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
|
|
99
99
|
working_data = element_dict.copy()
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
100
|
+
|
|
101
|
+
if self.upload_stager_config.flatten_metadata:
|
|
102
|
+
metadata: dict[str, Any] = working_data.pop("metadata", {})
|
|
103
|
+
flattened_metadata = flatten_dict(
|
|
104
|
+
metadata,
|
|
105
|
+
separator="_",
|
|
106
|
+
flatten_lists=False,
|
|
107
|
+
remove_none=True,
|
|
108
|
+
)
|
|
109
|
+
working_data.update(flattened_metadata)
|
|
104
110
|
|
|
105
111
|
# TODO: milvus sdk doesn't seem to support defaults via the schema yet,
|
|
106
112
|
# remove once that gets updated
|
|
@@ -154,6 +160,23 @@ class MilvusUploader(Uploader):
|
|
|
154
160
|
upload_config: MilvusUploaderConfig
|
|
155
161
|
connector_type: str = CONNECTOR_TYPE
|
|
156
162
|
|
|
163
|
+
def has_dynamic_fields_enabled(self) -> bool:
|
|
164
|
+
"""Check if the target collection has dynamic fields enabled."""
|
|
165
|
+
try:
|
|
166
|
+
with self.get_client() as client:
|
|
167
|
+
collection_info = client.describe_collection(self.upload_config.collection_name)
|
|
168
|
+
|
|
169
|
+
# Check if dynamic field is enabled
|
|
170
|
+
# The schema info should contain enable_dynamic_field or enableDynamicField
|
|
171
|
+
schema_info = collection_info.get(
|
|
172
|
+
"enable_dynamic_field",
|
|
173
|
+
collection_info.get("enableDynamicField", False),
|
|
174
|
+
)
|
|
175
|
+
return bool(schema_info)
|
|
176
|
+
except Exception as e:
|
|
177
|
+
logger.warning(f"Could not determine if collection has dynamic fields enabled: {e}")
|
|
178
|
+
return False
|
|
179
|
+
|
|
157
180
|
@DestinationConnectionError.wrap
|
|
158
181
|
def precheck(self):
|
|
159
182
|
from pymilvus import MilvusException
|
|
@@ -164,6 +187,7 @@ class MilvusUploader(Uploader):
|
|
|
164
187
|
raise DestinationConnectionError(
|
|
165
188
|
f"Collection '{self.upload_config.collection_name}' does not exist"
|
|
166
189
|
)
|
|
190
|
+
|
|
167
191
|
except MilvusException as milvus_exception:
|
|
168
192
|
raise DestinationConnectionError(
|
|
169
193
|
f"failed to precheck Milvus: {str(milvus_exception.message)}"
|
|
@@ -193,16 +217,66 @@ class MilvusUploader(Uploader):
|
|
|
193
217
|
)
|
|
194
218
|
|
|
195
219
|
@requires_dependencies(["pymilvus"], extras="milvus")
|
|
196
|
-
def
|
|
220
|
+
def _prepare_data_for_insert(self, data: list[dict]) -> list[dict]:
|
|
221
|
+
"""
|
|
222
|
+
Conforms the provided data to the schema of the target Milvus collection.
|
|
223
|
+
- If dynamic fields are enabled, it ensures JSON-stringified fields are decoded.
|
|
224
|
+
- If dynamic fields are disabled, it filters out any fields not present in the schema.
|
|
225
|
+
"""
|
|
226
|
+
|
|
227
|
+
dynamic_fields_enabled = self.has_dynamic_fields_enabled()
|
|
228
|
+
|
|
229
|
+
# If dynamic fields are enabled, 'languages' field needs to be a list
|
|
230
|
+
if dynamic_fields_enabled:
|
|
231
|
+
logger.debug("Dynamic fields enabled, ensuring 'languages' field is a list.")
|
|
232
|
+
prepared_data = []
|
|
233
|
+
for item in data:
|
|
234
|
+
new_item = item.copy()
|
|
235
|
+
if "languages" in new_item and isinstance(new_item["languages"], str):
|
|
236
|
+
try:
|
|
237
|
+
new_item["languages"] = json.loads(new_item["languages"])
|
|
238
|
+
except (json.JSONDecodeError, TypeError):
|
|
239
|
+
logger.warning(
|
|
240
|
+
f"Could not JSON decode languages field: {new_item['languages']}. "
|
|
241
|
+
"Leaving as string.",
|
|
242
|
+
)
|
|
243
|
+
prepared_data.append(new_item)
|
|
244
|
+
return prepared_data
|
|
245
|
+
|
|
246
|
+
# If dynamic fields are not enabled, we need to filter out the metadata fields
|
|
247
|
+
# to avoid insertion errors for fields not defined in the schema
|
|
248
|
+
with self.get_client() as client:
|
|
249
|
+
collection_info = client.describe_collection(
|
|
250
|
+
self.upload_config.collection_name,
|
|
251
|
+
)
|
|
252
|
+
schema_fields = {
|
|
253
|
+
field["name"]
|
|
254
|
+
for field in collection_info.get("fields", [])
|
|
255
|
+
if not field.get("auto_id", False)
|
|
256
|
+
}
|
|
257
|
+
# Remove metadata fields that are not part of the base schema
|
|
258
|
+
filtered_data = []
|
|
259
|
+
for item in data:
|
|
260
|
+
filtered_item = {key: value for key, value in item.items() if key in schema_fields}
|
|
261
|
+
filtered_data.append(filtered_item)
|
|
262
|
+
return filtered_data
|
|
263
|
+
|
|
264
|
+
@requires_dependencies(["pymilvus"], extras="milvus")
|
|
265
|
+
def insert_results(self, data: list[dict]):
|
|
197
266
|
from pymilvus import MilvusException
|
|
198
267
|
|
|
199
268
|
logger.info(
|
|
200
269
|
f"uploading {len(data)} entries to {self.connection_config.db_name} "
|
|
201
270
|
f"db in collection {self.upload_config.collection_name}"
|
|
202
271
|
)
|
|
272
|
+
|
|
273
|
+
prepared_data = self._prepare_data_for_insert(data=data)
|
|
274
|
+
|
|
203
275
|
with self.get_client() as client:
|
|
204
276
|
try:
|
|
205
|
-
res = client.insert(
|
|
277
|
+
res = client.insert(
|
|
278
|
+
collection_name=self.upload_config.collection_name, data=prepared_data
|
|
279
|
+
)
|
|
206
280
|
except MilvusException as milvus_exception:
|
|
207
281
|
raise WriteError(
|
|
208
282
|
f"failed to upload records to Milvus: {str(milvus_exception.message)}"
|
|
@@ -149,6 +149,10 @@ class MongoDBIndexer(Indexer):
|
|
|
149
149
|
|
|
150
150
|
for id_batch in batch_generator(ids, batch_size=batch_size):
|
|
151
151
|
# Make sure the hash is always a positive number to create identifier
|
|
152
|
+
display_name = (
|
|
153
|
+
f"{self.index_config.database}.{self.index_config.collection}, "
|
|
154
|
+
f"batch {id_batch[0]}-{id_batch[-1]}"
|
|
155
|
+
)
|
|
152
156
|
metadata = FileDataSourceMetadata(
|
|
153
157
|
date_processed=str(time()),
|
|
154
158
|
record_locator={
|
|
@@ -164,6 +168,7 @@ class MongoDBIndexer(Indexer):
|
|
|
164
168
|
additional_metadata=MongoDBAdditionalMetadata(
|
|
165
169
|
collection=self.index_config.collection, database=self.index_config.database
|
|
166
170
|
),
|
|
171
|
+
display_name=display_name,
|
|
167
172
|
)
|
|
168
173
|
yield file_data
|
|
169
174
|
|
|
@@ -174,6 +174,7 @@ class NotionIndexer(Indexer):
|
|
|
174
174
|
source_identifiers=source_identifiers,
|
|
175
175
|
metadata=metadata,
|
|
176
176
|
additional_metadata=additional_metadata,
|
|
177
|
+
display_name=source_identifiers.fullpath,
|
|
177
178
|
)
|
|
178
179
|
except Exception as e:
|
|
179
180
|
logger.error(f"Error retrieving page {page_id}: {e}")
|
|
@@ -210,6 +211,7 @@ class NotionIndexer(Indexer):
|
|
|
210
211
|
source_identifiers=source_identifiers,
|
|
211
212
|
metadata=metadata,
|
|
212
213
|
additional_metadata=additional_metadata,
|
|
214
|
+
display_name=source_identifiers.fullpath,
|
|
213
215
|
)
|
|
214
216
|
except Exception as e:
|
|
215
217
|
logger.error(f"Error retrieving database {database_id}: {e}")
|
|
@@ -19,11 +19,11 @@ class OriginalSyncedBlock(BlockBase):
|
|
|
19
19
|
@classmethod
|
|
20
20
|
def from_dict(cls, data: dict):
|
|
21
21
|
"""Create OriginalSyncedBlock from dictionary data.
|
|
22
|
-
|
|
22
|
+
|
|
23
23
|
Original blocks contain children content.
|
|
24
24
|
"""
|
|
25
25
|
if "children" not in data:
|
|
26
|
-
|
|
26
|
+
raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
|
|
27
27
|
return cls(children=data["children"])
|
|
28
28
|
|
|
29
29
|
def get_html(self) -> Optional[HtmlTag]:
|
|
@@ -38,7 +38,7 @@ class DuplicateSyncedBlock(BlockBase):
|
|
|
38
38
|
@staticmethod
|
|
39
39
|
def can_have_children() -> bool:
|
|
40
40
|
"""Check if duplicate synced blocks can have children.
|
|
41
|
-
|
|
41
|
+
|
|
42
42
|
Duplicate blocks themselves don't have children directly fetched here,
|
|
43
43
|
but they represent content that does, so Notion API might report has_children=True
|
|
44
44
|
on the parent block object. The actual children are fetched from the original block.
|
|
@@ -48,7 +48,7 @@ class DuplicateSyncedBlock(BlockBase):
|
|
|
48
48
|
@classmethod
|
|
49
49
|
def from_dict(cls, data: dict):
|
|
50
50
|
"""Create DuplicateSyncedBlock from dictionary data.
|
|
51
|
-
|
|
51
|
+
|
|
52
52
|
Duplicate blocks contain a 'synced_from' reference.
|
|
53
53
|
"""
|
|
54
54
|
synced_from_data = data.get("synced_from")
|
|
@@ -63,7 +63,7 @@ class DuplicateSyncedBlock(BlockBase):
|
|
|
63
63
|
|
|
64
64
|
def get_html(self) -> Optional[HtmlTag]:
|
|
65
65
|
"""Get HTML representation of the duplicate synced block.
|
|
66
|
-
|
|
66
|
+
|
|
67
67
|
HTML representation might need fetching the original block's content,
|
|
68
68
|
which is outside the scope of this simple data class.
|
|
69
69
|
"""
|
|
@@ -74,7 +74,7 @@ class SyncBlock(BlockBase):
|
|
|
74
74
|
@staticmethod
|
|
75
75
|
def can_have_children() -> bool:
|
|
76
76
|
"""Check if synced blocks can have children.
|
|
77
|
-
|
|
77
|
+
|
|
78
78
|
Synced blocks (both original and duplicate) can conceptually have children.
|
|
79
79
|
"""
|
|
80
80
|
return True
|
|
@@ -82,7 +82,7 @@ class SyncBlock(BlockBase):
|
|
|
82
82
|
@classmethod
|
|
83
83
|
def from_dict(cls, data: dict):
|
|
84
84
|
"""Create appropriate SyncedBlock subclass from dictionary data.
|
|
85
|
-
|
|
85
|
+
|
|
86
86
|
Determine if it's a duplicate (has 'synced_from') or original (has 'children').
|
|
87
87
|
"""
|
|
88
88
|
if data.get("synced_from") is not None:
|
|
@@ -99,10 +99,9 @@ class SyncBlock(BlockBase):
|
|
|
99
99
|
# Consider logging a warning here if strictness is needed.
|
|
100
100
|
return OriginalSyncedBlock(children=[])
|
|
101
101
|
|
|
102
|
-
|
|
103
102
|
def get_html(self) -> Optional[HtmlTag]:
|
|
104
103
|
"""Get HTML representation of the synced block.
|
|
105
|
-
|
|
104
|
+
|
|
106
105
|
The specific instance returned by from_dict (Original or Duplicate)
|
|
107
106
|
will handle its own get_html logic.
|
|
108
107
|
This method on the base SyncBlock might not be directly called.
|
|
@@ -149,11 +149,11 @@ class OutlookIndexer(Indexer):
|
|
|
149
149
|
|
|
150
150
|
def _message_to_file_data(self, message: "Message") -> FileData:
|
|
151
151
|
fullpath = self._generate_fullpath(message)
|
|
152
|
-
|
|
152
|
+
source_identifiers = SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath))
|
|
153
153
|
return FileData(
|
|
154
154
|
identifier=message.id,
|
|
155
155
|
connector_type=CONNECTOR_TYPE,
|
|
156
|
-
source_identifiers=
|
|
156
|
+
source_identifiers=source_identifiers,
|
|
157
157
|
metadata=FileDataSourceMetadata(
|
|
158
158
|
url=message.resource_url,
|
|
159
159
|
version=message.change_key,
|
|
@@ -178,6 +178,7 @@ class OutlookIndexer(Indexer):
|
|
|
178
178
|
"has_attachments": message.has_attachments,
|
|
179
179
|
"importance": message.importance,
|
|
180
180
|
},
|
|
181
|
+
display_name=source_identifiers.fullpath,
|
|
181
182
|
)
|
|
182
183
|
|
|
183
184
|
def _generate_fullpath(self, message: "Message") -> Path:
|
|
@@ -240,7 +240,7 @@ class PineconeUploader(VectorDBUploader):
|
|
|
240
240
|
destination_name: str = "unstructuredautocreated",
|
|
241
241
|
destination_type: Literal["pod", "serverless"] = "serverless",
|
|
242
242
|
serverless_cloud: str = "aws",
|
|
243
|
-
serverless_region: str = "us-
|
|
243
|
+
serverless_region: str = "us-east-1",
|
|
244
244
|
pod_environment: str = "us-east1-gcp",
|
|
245
245
|
pod_type: str = "p1.x1",
|
|
246
246
|
pod_count: int = 1,
|
|
@@ -182,14 +182,15 @@ class SalesforceIndexer(Indexer):
|
|
|
182
182
|
record_with_extension = record["Id"] + self.get_file_extension(
|
|
183
183
|
record["attributes"]["type"]
|
|
184
184
|
)
|
|
185
|
+
source_identifiers = SourceIdentifiers(
|
|
186
|
+
filename=record_with_extension,
|
|
187
|
+
fullpath=f"{record['attributes']['type']}/{record_with_extension}",
|
|
188
|
+
)
|
|
185
189
|
files_list.append(
|
|
186
190
|
FileData(
|
|
187
191
|
connector_type=CONNECTOR_TYPE,
|
|
188
192
|
identifier=record["Id"],
|
|
189
|
-
source_identifiers=
|
|
190
|
-
filename=record_with_extension,
|
|
191
|
-
fullpath=f"{record['attributes']['type']}/{record_with_extension}",
|
|
192
|
-
),
|
|
193
|
+
source_identifiers=source_identifiers,
|
|
193
194
|
metadata=FileDataSourceMetadata(
|
|
194
195
|
url=record["attributes"]["url"],
|
|
195
196
|
version=str(parser.parse(record["SystemModstamp"]).timestamp()),
|
|
@@ -200,6 +201,7 @@ class SalesforceIndexer(Indexer):
|
|
|
200
201
|
record_locator={"id": record["Id"]},
|
|
201
202
|
),
|
|
202
203
|
additional_metadata={"record_type": record["attributes"]["type"]},
|
|
204
|
+
display_name=source_identifiers.fullpath,
|
|
203
205
|
)
|
|
204
206
|
)
|
|
205
207
|
except SalesforceMalformedRequest as e:
|
|
@@ -122,12 +122,13 @@ class SlackIndexer(Indexer):
|
|
|
122
122
|
identifier = hashlib.sha256(identifier_base.encode("utf-8")).hexdigest()
|
|
123
123
|
filename = identifier[:16]
|
|
124
124
|
|
|
125
|
+
source_identifiers = SourceIdentifiers(
|
|
126
|
+
filename=f"{filename}.xml", fullpath=f"{filename}.xml"
|
|
127
|
+
)
|
|
125
128
|
return FileData(
|
|
126
129
|
identifier=identifier,
|
|
127
130
|
connector_type=CONNECTOR_TYPE,
|
|
128
|
-
source_identifiers=
|
|
129
|
-
filename=f"{filename}.xml", fullpath=f"{filename}.xml"
|
|
130
|
-
),
|
|
131
|
+
source_identifiers=source_identifiers,
|
|
131
132
|
metadata=FileDataSourceMetadata(
|
|
132
133
|
date_created=ts_oldest,
|
|
133
134
|
date_modified=ts_newest,
|
|
@@ -138,6 +139,7 @@ class SlackIndexer(Indexer):
|
|
|
138
139
|
"latest": ts_newest,
|
|
139
140
|
},
|
|
140
141
|
),
|
|
142
|
+
display_name=source_identifiers.fullpath,
|
|
141
143
|
)
|
|
142
144
|
|
|
143
145
|
@SourceConnectionError.wrap
|
|
@@ -130,7 +130,11 @@ class SQLIndexer(Indexer, ABC):
|
|
|
130
130
|
(len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
|
|
131
131
|
)
|
|
132
132
|
]
|
|
133
|
+
|
|
133
134
|
for batch in id_batches:
|
|
135
|
+
batch_items = [BatchItem(identifier=str(b)) for b in batch]
|
|
136
|
+
display_name = (f"{self.index_config.table_name}-{self.index_config.id_column}"
|
|
137
|
+
f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]")
|
|
134
138
|
# Make sure the hash is always a positive number to create identified
|
|
135
139
|
yield SqlBatchFileData(
|
|
136
140
|
connector_type=self.connector_type,
|
|
@@ -140,7 +144,8 @@ class SQLIndexer(Indexer, ABC):
|
|
|
140
144
|
additional_metadata=SqlAdditionalMetadata(
|
|
141
145
|
table_name=self.index_config.table_name, id_column=self.index_config.id_column
|
|
142
146
|
),
|
|
143
|
-
batch_items=
|
|
147
|
+
batch_items=batch_items,
|
|
148
|
+
display_name=display_name
|
|
144
149
|
)
|
|
145
150
|
|
|
146
151
|
|
|
@@ -86,12 +86,13 @@ class ZendeskIndexer(Indexer):
|
|
|
86
86
|
async def get_tickets(self) -> AsyncGenerator[ZendeskFileData, None]:
|
|
87
87
|
async with self.connection_config.get_client() as client:
|
|
88
88
|
async for ticket in client.get_tickets():
|
|
89
|
+
source_identifiers = SourceIdentifiers(
|
|
90
|
+
filename=f"{ticket.id}.txt", fullpath=f"tickets/{ticket.id}.txt"
|
|
91
|
+
)
|
|
89
92
|
yield ZendeskFileData(
|
|
90
93
|
identifier=str(ticket.id),
|
|
91
94
|
connector_type=self.connector_type,
|
|
92
|
-
source_identifiers=
|
|
93
|
-
filename=f"{ticket.id}.txt", fullpath=f"tickets/{ticket.id}.txt"
|
|
94
|
-
),
|
|
95
|
+
source_identifiers=source_identifiers,
|
|
95
96
|
additional_metadata=ZendeskAdditionalMetadata(
|
|
96
97
|
item_type="ticket", content=ticket
|
|
97
98
|
),
|
|
@@ -101,17 +102,19 @@ class ZendeskIndexer(Indexer):
|
|
|
101
102
|
date_modified=ticket.updated_at.isoformat() if ticket.updated_at else None,
|
|
102
103
|
date_processed=str(time()),
|
|
103
104
|
),
|
|
105
|
+
display_name=source_identifiers.fullpath,
|
|
104
106
|
)
|
|
105
107
|
|
|
106
108
|
async def get_articles(self) -> AsyncGenerator[ZendeskFileData, None]:
|
|
107
109
|
async with self.connection_config.get_client() as client:
|
|
108
110
|
async for article in client.get_articles():
|
|
111
|
+
source_identifiers = SourceIdentifiers(
|
|
112
|
+
filename=f"{article.id}.html", fullpath=f"articles/{article.id}.html"
|
|
113
|
+
)
|
|
109
114
|
yield ZendeskFileData(
|
|
110
115
|
identifier=str(article.id),
|
|
111
116
|
connector_type=self.connector_type,
|
|
112
|
-
source_identifiers=
|
|
113
|
-
filename=f"{article.id}.html", fullpath=f"articles/{article.id}.html"
|
|
114
|
-
),
|
|
117
|
+
source_identifiers=source_identifiers,
|
|
115
118
|
additional_metadata=ZendeskAdditionalMetadata(
|
|
116
119
|
item_type="article", content=article
|
|
117
120
|
),
|
|
@@ -123,6 +126,7 @@ class ZendeskIndexer(Indexer):
|
|
|
123
126
|
),
|
|
124
127
|
date_processed=str(time()),
|
|
125
128
|
),
|
|
129
|
+
display_name=source_identifiers.fullpath,
|
|
126
130
|
)
|
|
127
131
|
|
|
128
132
|
async def run_async(self, **kwargs: Any) -> AsyncGenerator[ZendeskFileData, None]:
|