unstructured-ingest 1.0.56__tar.gz → 1.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/PKG-INFO +2 -1
- unstructured_ingest-1.1.2/unstructured_ingest/__version__.py +1 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/azure_openai.py +7 -2
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/openai.py +11 -4
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/interfaces/connector.py +7 -1
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/otel.py +16 -1
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/delta_table.py +2 -1
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +106 -14
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/fsspec/s3.py +14 -4
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/onedrive.py +9 -8
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest-1.1.2/unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest-1.1.2/unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest-1.1.2/unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/utils/__init__.py +1 -1
- unstructured_ingest-1.1.2/unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.0.56/unstructured_ingest/__version__.py +0 -1
- unstructured_ingest-1.0.56/unstructured_ingest/processes/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/.gitignore +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/LICENSE.md +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/README.md +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/pyproject.toml +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/cli/README.md +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/cli/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/cli/base/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/cli/base/cmd.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/cli/base/dest.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/cli/base/importer.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/cli/base/src.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/cli/cli.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/cli/cmds.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/cli/utils/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/cli/utils/click.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/data_types/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/data_types/entities.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/data_types/file_data.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/bedrock.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/huggingface.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/interfaces.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/mixedbreadai.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/octoai.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/togetherai.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/vertexai.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/voyageai.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/error.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/errors_v2.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/interfaces/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/interfaces/downloader.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/interfaces/indexer.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/interfaces/process.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/interfaces/processor.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/interfaces/upload_stager.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/interfaces/uploader.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/logger.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/main.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/interfaces.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/otel.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/pipeline.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/steps/download.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/steps/embed.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/steps/filter.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/steps/index.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/steps/partition.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/steps/stage.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/pipeline/steps/upload.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/chunker.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connector_registry.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/airtable.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/astradb.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/chroma.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/confluence.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/databricks/volumes.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/discord.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/github.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/gitlab.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/google_drive.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/jira.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/local.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/milvus.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/mongodb.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/connector.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/outlook.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/pinecone.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/redisdb.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/salesforce.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/slack.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/sql/sql.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/utils.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/vectara.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/embedder.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/filter.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/partitioner.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/uncompress.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/unstructured_api.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/utils/chunking.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/utils/compression.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/utils/constants.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/utils/data_prep.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/utils/dep_check.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/utils/filesystem.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/utils/html.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/utils/ndjson.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/utils/pydantic_models.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
- {unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/utils/table.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: unstructured_ingest
|
|
3
|
-
Version: 1.0.56
|
|
3
|
+
Version: 1.1.2
|
|
4
4
|
Summary: Local ETL data pipeline to get data RAG ready
|
|
5
5
|
Author-email: Unstructured Technologies <devops@unstructuredai.io>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.12
|
|
19
19
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
20
|
Requires-Python: <3.13,>=3.9
|
|
21
|
+
Requires-Dist: certifi>=2025.7.14
|
|
21
22
|
Requires-Dist: click
|
|
22
23
|
Requires-Dist: opentelemetry-sdk
|
|
23
24
|
Requires-Dist: pydantic>=2.7
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.2" # pragma: no cover
|
{unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/azure_openai.py
RENAMED
|
@@ -9,6 +9,7 @@ from unstructured_ingest.embed.openai import (
|
|
|
9
9
|
OpenAIEmbeddingEncoder,
|
|
10
10
|
)
|
|
11
11
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
12
|
+
from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override
|
|
12
13
|
|
|
13
14
|
if TYPE_CHECKING:
|
|
14
15
|
from openai import AsyncAzureOpenAI, AzureOpenAI
|
|
@@ -23,9 +24,11 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
|
|
|
23
24
|
|
|
24
25
|
@requires_dependencies(["openai"], extras="openai")
|
|
25
26
|
def get_client(self) -> "AzureOpenAI":
|
|
26
|
-
from openai import AzureOpenAI
|
|
27
|
+
from openai import AzureOpenAI, DefaultHttpxClient
|
|
27
28
|
|
|
29
|
+
client = DefaultHttpxClient(verify=ssl_context_with_optional_ca_override())
|
|
28
30
|
return AzureOpenAI(
|
|
31
|
+
http_client=client,
|
|
29
32
|
api_key=self.api_key.get_secret_value(),
|
|
30
33
|
api_version=self.api_version,
|
|
31
34
|
azure_endpoint=self.azure_endpoint,
|
|
@@ -33,9 +36,11 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
|
|
|
33
36
|
|
|
34
37
|
@requires_dependencies(["openai"], extras="openai")
|
|
35
38
|
def get_async_client(self) -> "AsyncAzureOpenAI":
|
|
36
|
-
from openai import AsyncAzureOpenAI
|
|
39
|
+
from openai import AsyncAzureOpenAI, DefaultAsyncHttpxClient
|
|
37
40
|
|
|
41
|
+
client = DefaultAsyncHttpxClient(verify=ssl_context_with_optional_ca_override())
|
|
38
42
|
return AsyncAzureOpenAI(
|
|
43
|
+
http_client=client,
|
|
39
44
|
api_key=self.api_key.get_secret_value(),
|
|
40
45
|
api_version=self.api_version,
|
|
41
46
|
azure_endpoint=self.azure_endpoint,
|
{unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/embed/openai.py
RENAMED
|
@@ -18,6 +18,7 @@ from unstructured_ingest.errors_v2 import (
|
|
|
18
18
|
)
|
|
19
19
|
from unstructured_ingest.logger import logger
|
|
20
20
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
21
|
+
from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override
|
|
21
22
|
|
|
22
23
|
if TYPE_CHECKING:
|
|
23
24
|
from openai import AsyncOpenAI, OpenAI
|
|
@@ -86,15 +87,21 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
|
|
|
86
87
|
|
|
87
88
|
@requires_dependencies(["openai"], extras="openai")
|
|
88
89
|
def get_client(self) -> "OpenAI":
|
|
89
|
-
from openai import OpenAI
|
|
90
|
+
from openai import DefaultHttpxClient, OpenAI
|
|
90
91
|
|
|
91
|
-
|
|
92
|
+
client = DefaultHttpxClient(verify=ssl_context_with_optional_ca_override())
|
|
93
|
+
return OpenAI(
|
|
94
|
+
api_key=self.api_key.get_secret_value(), http_client=client, base_url=self.base_url
|
|
95
|
+
)
|
|
92
96
|
|
|
93
97
|
@requires_dependencies(["openai"], extras="openai")
|
|
94
98
|
def get_async_client(self) -> "AsyncOpenAI":
|
|
95
|
-
from openai import AsyncOpenAI
|
|
99
|
+
from openai import AsyncOpenAI, DefaultAsyncHttpxClient
|
|
96
100
|
|
|
97
|
-
|
|
101
|
+
client = DefaultAsyncHttpxClient(verify=ssl_context_with_optional_ca_override())
|
|
102
|
+
return AsyncOpenAI(
|
|
103
|
+
api_key=self.api_key.get_secret_value(), http_client=client, base_url=self.base_url
|
|
104
|
+
)
|
|
98
105
|
|
|
99
106
|
|
|
100
107
|
@dataclass
|
{unstructured_ingest-1.0.56 → unstructured_ingest-1.1.2}/unstructured_ingest/interfaces/connector.py
RENAMED
|
@@ -5,6 +5,8 @@ from typing import Any, TypeVar, Union
|
|
|
5
5
|
from pydantic import BaseModel, Secret, model_validator
|
|
6
6
|
from pydantic.types import _SecretBase
|
|
7
7
|
|
|
8
|
+
from unstructured_ingest.processes.utils.logging.connector import ConnectorLoggingMixin
|
|
9
|
+
|
|
8
10
|
|
|
9
11
|
class AccessConfig(BaseModel):
|
|
10
12
|
"""Meant to designate holding any sensitive information associated with other configs
|
|
@@ -46,5 +48,9 @@ ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
|
|
|
46
48
|
|
|
47
49
|
|
|
48
50
|
@dataclass
|
|
49
|
-
class BaseConnector(ABC):
|
|
51
|
+
class BaseConnector(ABC, ConnectorLoggingMixin):
|
|
50
52
|
connection_config: ConnectionConfigT
|
|
53
|
+
|
|
54
|
+
def __post_init__(self):
|
|
55
|
+
"""Initialize the logging mixin after dataclass initialization."""
|
|
56
|
+
ConnectorLoggingMixin.__init__(self)
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
import os
|
|
2
3
|
from dataclasses import dataclass, field
|
|
3
4
|
from typing import Callable, ClassVar, Optional, Protocol, Sequence
|
|
@@ -31,13 +32,27 @@ class LogSpanExporter(ConsoleSpanExporter):
|
|
|
31
32
|
self.log_out(self.formatter(span))
|
|
32
33
|
return SpanExportResult.SUCCESS
|
|
33
34
|
|
|
35
|
+
def get_log_out() -> Callable:
|
|
36
|
+
level_names_mapping = {
|
|
37
|
+
'CRITICAL': logging.CRITICAL,
|
|
38
|
+
'FATAL': logging.FATAL,
|
|
39
|
+
'ERROR': logging.ERROR,
|
|
40
|
+
'WARN': logging.WARNING,
|
|
41
|
+
'WARNING': logging.WARNING,
|
|
42
|
+
'INFO': logging.INFO,
|
|
43
|
+
'DEBUG': logging.DEBUG,
|
|
44
|
+
'NOTSET': logging.NOTSET,
|
|
45
|
+
}
|
|
46
|
+
log_level = os.getenv("OTEL_LOG_LEVEL", "DEBUG").upper()
|
|
47
|
+
log_level_int = level_names_mapping.get(log_level, logging.DEBUG)
|
|
48
|
+
return lambda message: logger.log(log_level_int, message)
|
|
34
49
|
|
|
35
50
|
@dataclass
|
|
36
51
|
class OtelHandler:
|
|
37
52
|
otel_endpoint: Optional[str] = None
|
|
38
53
|
service_name: str = "unstructured-ingest"
|
|
39
54
|
trace_provider: TracerProvider = field(init=False)
|
|
40
|
-
log_out: Callable = field(default=
|
|
55
|
+
log_out: Callable = field(default=get_log_out())
|
|
41
56
|
trace_context_key: ClassVar[str] = "_trace_context"
|
|
42
57
|
|
|
43
58
|
def init_trace(self):
|
|
@@ -48,7 +48,8 @@ class DeltaTableAccessConfig(AccessConfig):
|
|
|
48
48
|
|
|
49
49
|
class DeltaTableConnectionConfig(ConnectionConfig):
|
|
50
50
|
access_config: Secret[DeltaTableAccessConfig] = Field(
|
|
51
|
-
|
|
51
|
+
default_factory=lambda: Secret[DeltaTableAccessConfig](DeltaTableAccessConfig()),
|
|
52
|
+
validate_default=True,
|
|
52
53
|
)
|
|
53
54
|
aws_region: Optional[str] = Field(default=None, description="AWS Region")
|
|
54
55
|
table_uri: str = Field(
|
|
@@ -28,7 +28,6 @@ from unstructured_ingest.interfaces import (
|
|
|
28
28
|
Uploader,
|
|
29
29
|
UploaderConfig,
|
|
30
30
|
)
|
|
31
|
-
from unstructured_ingest.logger import logger
|
|
32
31
|
from unstructured_ingest.processes.connectors.fsspec.utils import sterilize_dict
|
|
33
32
|
from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
|
|
34
33
|
|
|
@@ -106,6 +105,12 @@ class FsspecIndexer(Indexer):
|
|
|
106
105
|
def precheck(self) -> None:
|
|
107
106
|
from fsspec import get_filesystem_class
|
|
108
107
|
|
|
108
|
+
self.log_operation_start(
|
|
109
|
+
"Connection validation",
|
|
110
|
+
protocol=self.index_config.protocol,
|
|
111
|
+
path=self.index_config.path_without_protocol,
|
|
112
|
+
)
|
|
113
|
+
|
|
109
114
|
try:
|
|
110
115
|
fs = get_filesystem_class(self.index_config.protocol)(
|
|
111
116
|
**self.connection_config.get_access_config(),
|
|
@@ -113,13 +118,24 @@ class FsspecIndexer(Indexer):
|
|
|
113
118
|
files = fs.ls(path=self.index_config.path_without_protocol, detail=True)
|
|
114
119
|
valid_files = [x.get("name") for x in files if x.get("type") == "file"]
|
|
115
120
|
if not valid_files:
|
|
121
|
+
self.log_operation_complete("Connection validation", count=0)
|
|
116
122
|
return
|
|
117
123
|
file_to_sample = valid_files[0]
|
|
118
|
-
|
|
124
|
+
self.log_debug(f"attempting to make HEAD request for file: {file_to_sample}")
|
|
119
125
|
with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
|
|
120
126
|
client.head(path=file_to_sample)
|
|
127
|
+
|
|
128
|
+
self.log_connection_validated(
|
|
129
|
+
connector_type=self.connector_type,
|
|
130
|
+
endpoint=f"{self.index_config.protocol}://{self.index_config.path_without_protocol}",
|
|
131
|
+
)
|
|
132
|
+
|
|
121
133
|
except Exception as e:
|
|
122
|
-
|
|
134
|
+
self.log_connection_failed(
|
|
135
|
+
connector_type=self.connector_type,
|
|
136
|
+
error=e,
|
|
137
|
+
endpoint=f"{self.index_config.protocol}://{self.index_config.path_without_protocol}",
|
|
138
|
+
)
|
|
123
139
|
raise self.wrap_error(e=e)
|
|
124
140
|
|
|
125
141
|
def get_file_info(self) -> list[dict[str, Any]]:
|
|
@@ -150,7 +166,7 @@ class FsspecIndexer(Indexer):
|
|
|
150
166
|
|
|
151
167
|
def sample_n_files(self, files: list[dict[str, Any]], n) -> list[dict[str, Any]]:
|
|
152
168
|
if len(files) <= n:
|
|
153
|
-
|
|
169
|
+
self.log_warning(
|
|
154
170
|
f"number of files to be sampled={n} is not smaller than the number"
|
|
155
171
|
f" of files found ({len(files)}). Returning all of the files as the"
|
|
156
172
|
" sample."
|
|
@@ -201,9 +217,22 @@ class FsspecIndexer(Indexer):
|
|
|
201
217
|
init_file_data.additional_metadata = self.get_metadata(file_info=file_info)
|
|
202
218
|
|
|
203
219
|
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
220
|
+
self.log_indexing_start(f"{self.connector_type} files")
|
|
221
|
+
|
|
204
222
|
files = self.get_file_info()
|
|
205
|
-
|
|
223
|
+
total_files = len(files)
|
|
224
|
+
|
|
225
|
+
self.log_operation_start("File indexing", total_files=total_files)
|
|
226
|
+
|
|
227
|
+
for i, file_info in enumerate(files):
|
|
206
228
|
file_path = self.get_path(file_info=file_info)
|
|
229
|
+
|
|
230
|
+
# Only log progress for larger operations
|
|
231
|
+
if total_files > 5:
|
|
232
|
+
self.log_progress(
|
|
233
|
+
current=i + 1, total=total_files, item_type="files", operation="Indexing"
|
|
234
|
+
)
|
|
235
|
+
|
|
207
236
|
# Note: we remove any remaining leading slashes (Box introduces these)
|
|
208
237
|
# to get a valid relative path
|
|
209
238
|
rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")
|
|
@@ -223,6 +252,8 @@ class FsspecIndexer(Indexer):
|
|
|
223
252
|
display_name=file_path,
|
|
224
253
|
)
|
|
225
254
|
|
|
255
|
+
self.log_indexing_complete(f"{self.connector_type} files", total_files)
|
|
256
|
+
|
|
226
257
|
|
|
227
258
|
class FsspecDownloaderConfig(DownloaderConfig):
|
|
228
259
|
pass
|
|
@@ -272,25 +303,57 @@ class FsspecDownloader(Downloader):
|
|
|
272
303
|
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
273
304
|
download_path = self.get_download_path(file_data=file_data)
|
|
274
305
|
mkdir_concurrent_safe(download_path.parent)
|
|
306
|
+
|
|
307
|
+
rpath = file_data.additional_metadata["original_file_path"]
|
|
308
|
+
file_size = file_data.metadata.filesize_bytes
|
|
309
|
+
self.log_download_start(file_path=rpath, file_id=file_data.identifier, file_size=file_size)
|
|
310
|
+
|
|
275
311
|
try:
|
|
276
|
-
rpath = file_data.additional_metadata["original_file_path"]
|
|
277
312
|
with self.connection_config.get_client(protocol=self.protocol) as client:
|
|
278
313
|
client.get_file(rpath=rpath, lpath=download_path.as_posix())
|
|
279
314
|
self.handle_directory_download(lpath=download_path)
|
|
315
|
+
|
|
280
316
|
except Exception as e:
|
|
317
|
+
self.log_error(
|
|
318
|
+
"File download failed",
|
|
319
|
+
error=e,
|
|
320
|
+
context={"file_path": rpath, "file_id": file_data.identifier},
|
|
321
|
+
)
|
|
281
322
|
raise self.wrap_error(e=e)
|
|
323
|
+
|
|
324
|
+
self.log_download_complete(
|
|
325
|
+
file_path=rpath,
|
|
326
|
+
file_id=file_data.identifier,
|
|
327
|
+
download_path=str(download_path),
|
|
328
|
+
)
|
|
329
|
+
|
|
282
330
|
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
283
331
|
|
|
284
332
|
async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
285
333
|
download_path = self.get_download_path(file_data=file_data)
|
|
286
334
|
mkdir_concurrent_safe(download_path.parent)
|
|
335
|
+
rpath = file_data.additional_metadata["original_file_path"]
|
|
336
|
+
file_size = file_data.metadata.filesize_bytes
|
|
337
|
+
self.log_download_start(file_path=rpath, file_id=file_data.identifier, file_size=file_size)
|
|
338
|
+
|
|
287
339
|
try:
|
|
288
|
-
rpath = file_data.additional_metadata["original_file_path"]
|
|
289
340
|
with self.connection_config.get_client(protocol=self.protocol) as client:
|
|
290
341
|
await client.get_file(rpath=rpath, lpath=download_path.as_posix())
|
|
291
342
|
self.handle_directory_download(lpath=download_path)
|
|
292
343
|
except Exception as e:
|
|
344
|
+
self.log_error(
|
|
345
|
+
"File download failed",
|
|
346
|
+
error=e,
|
|
347
|
+
context={"file_path": rpath, "file_id": file_data.identifier},
|
|
348
|
+
)
|
|
293
349
|
raise self.wrap_error(e=e)
|
|
350
|
+
|
|
351
|
+
self.log_download_complete(
|
|
352
|
+
file_path=rpath,
|
|
353
|
+
file_id=file_data.identifier,
|
|
354
|
+
download_path=str(download_path),
|
|
355
|
+
)
|
|
356
|
+
|
|
294
357
|
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
295
358
|
|
|
296
359
|
|
|
@@ -321,6 +384,7 @@ class FsspecUploader(Uploader):
|
|
|
321
384
|
)
|
|
322
385
|
|
|
323
386
|
def __post_init__(self):
|
|
387
|
+
super().__post_init__()
|
|
324
388
|
# TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
|
|
325
389
|
if not self.upload_config:
|
|
326
390
|
raise TypeError(
|
|
@@ -334,6 +398,8 @@ class FsspecUploader(Uploader):
|
|
|
334
398
|
def precheck(self) -> None:
|
|
335
399
|
from fsspec import get_filesystem_class
|
|
336
400
|
|
|
401
|
+
self.log_operation_start("Connection validation", protocol=self.upload_config.protocol)
|
|
402
|
+
|
|
337
403
|
try:
|
|
338
404
|
fs = get_filesystem_class(self.upload_config.protocol)(
|
|
339
405
|
**self.connection_config.get_access_config(),
|
|
@@ -341,7 +407,16 @@ class FsspecUploader(Uploader):
|
|
|
341
407
|
upload_path = Path(self.upload_config.path_without_protocol) / "_empty"
|
|
342
408
|
fs.write_bytes(path=upload_path.as_posix(), value=b"")
|
|
343
409
|
except Exception as e:
|
|
410
|
+
self.log_connection_failed(
|
|
411
|
+
connector_type=self.connector_type,
|
|
412
|
+
error=e,
|
|
413
|
+
endpoint=f"{self.upload_config.protocol}://{self.upload_config.path_without_protocol}",
|
|
414
|
+
)
|
|
344
415
|
raise self.wrap_error(e=e)
|
|
416
|
+
self.log_connection_validated(
|
|
417
|
+
connector_type=self.connector_type,
|
|
418
|
+
endpoint=f"{self.upload_config.protocol}://{self.upload_config.path_without_protocol}",
|
|
419
|
+
)
|
|
345
420
|
|
|
346
421
|
def get_upload_path(self, file_data: FileData) -> Path:
|
|
347
422
|
upload_path = Path(
|
|
@@ -353,14 +428,31 @@ class FsspecUploader(Uploader):
|
|
|
353
428
|
def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
354
429
|
path_str = str(path.resolve())
|
|
355
430
|
upload_path = self.get_upload_path(file_data=file_data)
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
431
|
+
self.log_upload_start(file_path=path_str, destination=upload_path.as_posix())
|
|
432
|
+
try:
|
|
433
|
+
with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
|
|
434
|
+
client.upload(lpath=path_str, rpath=upload_path.as_posix())
|
|
435
|
+
except Exception as e:
|
|
436
|
+
self.log_error(
|
|
437
|
+
"File upload failed",
|
|
438
|
+
error=e,
|
|
439
|
+
context={"file_path": path_str, "destination": upload_path.as_posix()},
|
|
440
|
+
)
|
|
441
|
+
raise self.wrap_error(e=e)
|
|
442
|
+
self.log_upload_complete(file_path=path_str, destination=upload_path.as_posix())
|
|
359
443
|
|
|
360
444
|
async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
361
445
|
path_str = str(path.resolve())
|
|
362
446
|
upload_path = self.get_upload_path(file_data=file_data)
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
447
|
+
self.log_upload_start(file_path=path_str, destination=upload_path.as_posix())
|
|
448
|
+
try:
|
|
449
|
+
with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
|
|
450
|
+
client.upload(lpath=path_str, rpath=upload_path.as_posix())
|
|
451
|
+
except Exception as e:
|
|
452
|
+
self.log_error(
|
|
453
|
+
"File upload failed",
|
|
454
|
+
error=e,
|
|
455
|
+
context={"file_path": path_str, "destination": upload_path.as_posix()},
|
|
456
|
+
)
|
|
457
|
+
raise self.wrap_error(e=e)
|
|
458
|
+
self.log_upload_complete(file_path=path_str, destination=upload_path.as_posix())
|
|
@@ -104,7 +104,13 @@ class S3ConnectionConfig(FsspecConnectionConfig):
|
|
|
104
104
|
return UserError(message)
|
|
105
105
|
if http_code >= 500:
|
|
106
106
|
return ProviderError(message)
|
|
107
|
-
logger.error(
|
|
107
|
+
logger.error(
|
|
108
|
+
"Unhandled exception from S3 (type: %s, endpoint: %s): %s",
|
|
109
|
+
type(e).__name__,
|
|
110
|
+
self.endpoint_url or "default",
|
|
111
|
+
e,
|
|
112
|
+
exc_info=True,
|
|
113
|
+
)
|
|
108
114
|
return e
|
|
109
115
|
|
|
110
116
|
|
|
@@ -122,6 +128,10 @@ class S3Indexer(FsspecIndexer):
|
|
|
122
128
|
|
|
123
129
|
def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
|
|
124
130
|
path = file_info["Key"]
|
|
131
|
+
|
|
132
|
+
self.log_debug("Getting metadata for S3 object", context={"file_path": path})
|
|
133
|
+
self.log_file_operation("Getting metadata", file_path=path)
|
|
134
|
+
|
|
125
135
|
date_created = None
|
|
126
136
|
date_modified = None
|
|
127
137
|
modified = file_info.get("LastModified")
|
|
@@ -147,9 +157,9 @@ class S3Indexer(FsspecIndexer):
|
|
|
147
157
|
record_locator["metadata"] = metadata
|
|
148
158
|
issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
|
|
149
159
|
if issue_characters:
|
|
150
|
-
|
|
151
|
-
f"File path
|
|
152
|
-
|
|
160
|
+
self.log_warning(
|
|
161
|
+
f"File path contains characters that can cause issues with S3: {issue_characters}",
|
|
162
|
+
context={"path": path, "problematic_characters": issue_characters},
|
|
153
163
|
)
|
|
154
164
|
return FileDataSourceMetadata(
|
|
155
165
|
date_created=date_created,
|
|
@@ -115,23 +115,24 @@ class OnedriveConnectionConfig(ConnectionConfig):
|
|
|
115
115
|
except ValueError as exc:
|
|
116
116
|
logger.error("Couldn't set up credentials.")
|
|
117
117
|
raise exc
|
|
118
|
-
|
|
118
|
+
|
|
119
119
|
if "error" in token:
|
|
120
120
|
error_codes = token.get("error_codes", [])
|
|
121
121
|
error_type = token.get("error", "")
|
|
122
122
|
error_description = token.get("error_description", "")
|
|
123
|
-
|
|
123
|
+
|
|
124
124
|
# 7000215: Invalid client secret provided
|
|
125
125
|
# 7000218: Invalid client id provided
|
|
126
126
|
# 700016: Application not found in directory
|
|
127
127
|
# 90002: Tenant not found
|
|
128
128
|
auth_error_codes = [7000215, 7000218, 700016, 90002]
|
|
129
|
-
|
|
130
|
-
if
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
129
|
+
|
|
130
|
+
if any(code in error_codes for code in auth_error_codes) or error_type in [
|
|
131
|
+
"invalid_client",
|
|
132
|
+
"unauthorized_client",
|
|
133
|
+
"invalid_grant",
|
|
134
|
+
]:
|
|
135
|
+
raise UserAuthError(f"Authentication failed: {error_type}: {error_description}")
|
|
135
136
|
else:
|
|
136
137
|
raise SourceConnectionNetworkError(
|
|
137
138
|
f"Failed to fetch token: {error_type}: {error_description}"
|
|
@@ -87,6 +87,7 @@ class SharepointIndexerConfig(OnedriveIndexerConfig):
|
|
|
87
87
|
# TODO: We can probably make path non-optional on OnedriveIndexerConfig once tested
|
|
88
88
|
path: str = Field(default="")
|
|
89
89
|
|
|
90
|
+
|
|
90
91
|
@dataclass
|
|
91
92
|
class SharepointIndexer(OnedriveIndexer):
|
|
92
93
|
connection_config: SharepointConnectionConfig
|
|
@@ -114,14 +115,14 @@ class SharepointIndexer(OnedriveIndexer):
|
|
|
114
115
|
def _is_root_path(self, path: str) -> bool:
|
|
115
116
|
"""Check if the path represents root access (empty string or legacy default)."""
|
|
116
117
|
return not path or not path.strip() or path == LEGACY_DEFAULT_PATH
|
|
117
|
-
|
|
118
|
+
|
|
118
119
|
def _get_target_drive_item(self, site_drive_item: DriveItem, path: str) -> DriveItem:
|
|
119
120
|
"""Get the drive item to search in based on the path."""
|
|
120
121
|
if self._is_root_path(path):
|
|
121
122
|
return site_drive_item
|
|
122
123
|
else:
|
|
123
124
|
return site_drive_item.get_by_path(path).get().execute_query()
|
|
124
|
-
|
|
125
|
+
|
|
125
126
|
def _validate_folder_path(self, site_drive_item: DriveItem, path: str) -> None:
|
|
126
127
|
"""Validate that a specific folder path exists and is accessible."""
|
|
127
128
|
from office365.runtime.client_request_exception import ClientRequestException
|