unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
4
|
+
add_destination_entry,
|
|
5
|
+
add_source_entry,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
from .azure import CONNECTOR_TYPE as AZURE_CONNECTOR_TYPE
|
|
9
|
+
from .azure import azure_destination_entry, azure_source_entry
|
|
10
|
+
from .box import CONNECTOR_TYPE as BOX_CONNECTOR_TYPE
|
|
11
|
+
from .box import box_destination_entry, box_source_entry
|
|
12
|
+
from .dropbox import CONNECTOR_TYPE as DROPBOX_CONNECTOR_TYPE
|
|
13
|
+
from .dropbox import dropbox_destination_entry, dropbox_source_entry
|
|
14
|
+
from .gcs import CONNECTOR_TYPE as GCS_CONNECTOR_TYPE
|
|
15
|
+
from .gcs import gcs_destination_entry, gcs_source_entry
|
|
16
|
+
from .s3 import CONNECTOR_TYPE as S3_CONNECTOR_TYPE
|
|
17
|
+
from .s3 import s3_destination_entry, s3_source_entry
|
|
18
|
+
from .sftp import CONNECTOR_TYPE as SFTP_CONNECTOR_TYPE
|
|
19
|
+
from .sftp import sftp_destination_entry, sftp_source_entry
|
|
20
|
+
|
|
21
|
+
# Register every fsspec-backed connector under its connector type, first as a
# source and then as a destination. The tuple order reproduces the original
# call order: azure, box, dropbox, gcs, s3, sftp.
for _connector_type, _source_entry, _destination_entry in (
    (AZURE_CONNECTOR_TYPE, azure_source_entry, azure_destination_entry),
    (BOX_CONNECTOR_TYPE, box_source_entry, box_destination_entry),
    (DROPBOX_CONNECTOR_TYPE, dropbox_source_entry, dropbox_destination_entry),
    (GCS_CONNECTOR_TYPE, gcs_source_entry, gcs_destination_entry),
    (S3_CONNECTOR_TYPE, s3_source_entry, s3_destination_entry),
    (SFTP_CONNECTOR_TYPE, sftp_source_entry, sftp_destination_entry),
):
    add_source_entry(source_type=_connector_type, entry=_source_entry)
    add_destination_entry(destination_type=_connector_type, entry=_destination_entry)
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from time import time
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, Secret
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
|
+
from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
|
|
12
|
+
from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
|
|
13
|
+
from unstructured_ingest.v2.logger import logger
|
|
14
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
15
|
+
DestinationRegistryEntry,
|
|
16
|
+
SourceRegistryEntry,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
19
|
+
FsspecAccessConfig,
|
|
20
|
+
FsspecConnectionConfig,
|
|
21
|
+
FsspecDownloader,
|
|
22
|
+
FsspecDownloaderConfig,
|
|
23
|
+
FsspecIndexer,
|
|
24
|
+
FsspecIndexerConfig,
|
|
25
|
+
FsspecUploader,
|
|
26
|
+
FsspecUploaderConfig,
|
|
27
|
+
)
|
|
28
|
+
from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from adlfs import AzureBlobFileSystem
|
|
32
|
+
|
|
33
|
+
CONNECTOR_TYPE = "azure"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def azure_json_serial(obj):
    """Serialization fallback for values that appear in Azure file info.

    ``ContentSettings`` objects are converted with ``dict(obj)`` and
    ``bytearray`` objects with ``str(obj)``; every other value is handed to
    ``json_serial``.
    """
    # Imported lazily so the azure SDK is only needed when this hook runs.
    from azure.storage.blob._models import ContentSettings

    # Checked in order: ContentSettings first, then bytearray.
    converters = ((ContentSettings, dict), (bytearray, str))
    for handled_type, convert in converters:
        if isinstance(obj, handled_type):
            return convert(obj)
    return json_serial(obj)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class AzureIndexerConfig(FsspecIndexerConfig):
    # Declares no fields of its own; it exists so the Azure connector has a
    # dedicated indexer-config type.
    pass
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class AzureAccessConfig(FsspecAccessConfig):
    # Credential options for Azure storage. All fields default to None;
    # model_post_init below rejects a config that sets neither
    # connection_string nor account_name.
    account_name: Optional[str] = Field(
        default=None,
        description=(
            "The storage account name. This is used to authenticate "
            "requests signed with an account key and to construct "
            "the storage endpoint. It is required unless a connection "
            "string is given, or if a custom domain is used with "
            "anonymous authentication."
        ),
    )
    account_key: Optional[str] = Field(
        default=None,
        description=(
            "The storage account key. This is used for shared key "
            "authentication. If any of account key, sas token or "
            "client_id are not specified, anonymous access will be used."
        ),
    )
    connection_string: Optional[str] = Field(
        default=None,
        description=(
            "If specified, this will override all other parameters. See "
            "http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ "  # noqa: E501
            "for the connection string format."
        ),
    )
    sas_token: Optional[str] = Field(
        default=None,
        description=(
            "A shared access signature token to use to authenticate "
            "requests instead of the account key. If account key and "
            "sas token are both specified, account key will be used "
            "to sign. If any of account key, sas token or client_id "
            "are not specified, anonymous access will be used."
        ),
    )

    def model_post_init(self, __context: Any) -> None:
        """Validate that a connection string or an account name was supplied.

        Raises:
            ValueError: if both ``connection_string`` and ``account_name`` are None.
        """
        has_connection_string = self.connection_string is not None
        has_account_name = self.account_name is not None
        if has_connection_string or has_account_name:
            return
        raise ValueError("either connection_string or account_name must be set")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class AzureConnectionConfig(FsspecConnectionConfig):
    # Connection settings for the Azure connector; the only supported
    # protocol is "az".
    supported_protocols: list[str] = field(default_factory=lambda: ["az"], init=False)
    access_config: Secret[AzureAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Return the dumped access config with every falsy value removed.

        The filter is a truthiness check, so unset (``None``) options are
        dropped and so are empty strings.
        """
        dumped = self.access_config.get_secret_value().model_dump()
        return {k: v for k, v in dumped.items() if v}

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["AzureBlobFileSystem", None, None]:
        """Yield the client obtained from the parent ``get_client`` for ``protocol``."""
        with super().get_client(protocol=protocol) as client:
            yield client

    def wrap_error(self, e: Exception) -> Exception:
        """Translate an Azure SDK exception into one of the ingest error types.

        Args:
            e: the exception caught while talking to Azure.

        Returns:
            ``UserAuthError`` for ``ClientAuthenticationError``, ``UserError``
            for a 4xx status, ``ProviderError`` for a status of 500 or above,
            and ``e`` itself (after logging) for anything else.
        """
        from azure.core.exceptions import ClientAuthenticationError, HttpResponseError

        if not isinstance(e, HttpResponseError):
            logger.error(f"unhandled exception from azure ({type(e)}): {e}", exc_info=True)
            return e
        # `reason` can be None when the exception carries no HTTP response;
        # fall back to the exception's own message so the wrapped error is
        # never created with a None message.
        message = e.reason or getattr(e, "message", None) or str(e)
        if isinstance(e, ClientAuthenticationError):
            return UserAuthError(message)
        status_code = e.status_code
        if status_code is not None:
            if 400 <= status_code < 500:
                return UserError(message)
            if status_code >= 500:
                return ProviderError(message)
        logger.error(f"unhandled exception from azure ({type(e)}): {e}", exc_info=True)
        return e
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# Indexer for the Azure connector: supplies Azure-specific serialization and
# metadata extraction on top of FsspecIndexer.
@dataclass
class AzureIndexer(FsspecIndexer):
    connection_config: AzureConnectionConfig
    index_config: AzureIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def sterilize_info(self, file_data: dict) -> dict:
        """Run ``sterilize_dict`` over ``file_data`` using ``azure_json_serial`` as fallback."""
        return sterilize_dict(data=file_data, default=azure_json_serial)

    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        """Build ``FileDataSourceMetadata`` from one file-info dict.

        Args:
            file_data: file info; ``name`` is required, while ``creation_time``,
                ``last_modified``, ``size`` and ``etag`` are read when present.

        Returns:
            Metadata whose dates are stringified epoch timestamps, or ``None``
            when the corresponding value is missing or ``None``.
        """
        path = file_data["name"]

        # Guard on the value, not just the key: a key that is present but set
        # to None would otherwise fail on `.timestamp()`.
        created = file_data.get("creation_time")
        modified = file_data.get("last_modified")
        date_created = str(created.timestamp()) if created is not None else None
        date_modified = str(modified.timestamp()) if modified is not None else None

        file_size = file_data.get("size")

        version = file_data.get("etag")
        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
        }
        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=version,
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
            filesize_bytes=file_size,
        )
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class AzureDownloaderConfig(FsspecDownloaderConfig):
    # Declares no fields of its own; used as the Azure downloader's config type.
    pass
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# Downloader for the Azure connector, fixed to the "az" protocol. The field
# declaration order is kept exactly as in the original.
@dataclass
class AzureDownloader(FsspecDownloader):
    protocol: str = "az"
    connection_config: AzureConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh AzureDownloaderConfig is created when no config is supplied.
    download_config: Optional[AzureDownloaderConfig] = field(
        default_factory=AzureDownloaderConfig
    )
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
class AzureUploaderConfig(FsspecUploaderConfig):
    # Declares no fields of its own; used as the Azure uploader's config type.
    pass
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# Uploader for the Azure connector. The field declaration order is kept
# exactly as in the original.
@dataclass
class AzureUploader(FsspecUploader):
    connector_type: str = CONNECTOR_TYPE
    connection_config: AzureConnectionConfig
    # Defaults to None when no upload config is passed.
    upload_config: AzureUploaderConfig = field(default=None)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# Registry entry bundling the Azure classes used when reading from Azure.
azure_source_entry = SourceRegistryEntry(
    connection_config=AzureConnectionConfig,
    indexer=AzureIndexer,
    indexer_config=AzureIndexerConfig,
    downloader=AzureDownloader,
    downloader_config=AzureDownloaderConfig,
)

# Registry entry bundling the Azure classes used when writing to Azure.
azure_destination_entry = DestinationRegistryEntry(
    connection_config=AzureConnectionConfig,
    uploader=AzureUploader,
    uploader_config=AzureUploaderConfig,
)
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from time import time
|
|
6
|
+
from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
|
|
7
|
+
|
|
8
|
+
from dateutil import parser
|
|
9
|
+
from pydantic import Field, Secret
|
|
10
|
+
from pydantic.functional_validators import BeforeValidator
|
|
11
|
+
|
|
12
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
+
from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
|
|
14
|
+
from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
|
|
15
|
+
from unstructured_ingest.v2.logger import logger
|
|
16
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
17
|
+
DestinationRegistryEntry,
|
|
18
|
+
SourceRegistryEntry,
|
|
19
|
+
)
|
|
20
|
+
from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
21
|
+
FsspecAccessConfig,
|
|
22
|
+
FsspecConnectionConfig,
|
|
23
|
+
FsspecDownloader,
|
|
24
|
+
FsspecDownloaderConfig,
|
|
25
|
+
FsspecIndexer,
|
|
26
|
+
FsspecIndexerConfig,
|
|
27
|
+
FsspecUploader,
|
|
28
|
+
FsspecUploaderConfig,
|
|
29
|
+
)
|
|
30
|
+
from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from boxfs import BoxFileSystem
|
|
34
|
+
|
|
35
|
+
CONNECTOR_TYPE = "box"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class BoxIndexerConfig(FsspecIndexerConfig):
    # Declares no fields of its own; used as the Box indexer's config type.
    pass
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class BoxAccessConfig(FsspecAccessConfig):
    # Required field with no default. The BeforeValidator passes the raw
    # input through conform_string_to_dict before the dict validation runs.
    box_app_config: Annotated[dict, BeforeValidator(conform_string_to_dict)] = Field(
        description="Box app credentials as a JSON string."
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class BoxConnectionConfig(FsspecConnectionConfig):
    # Connection settings for the Box connector; the only supported protocol
    # is "box".
    supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
    access_config: Secret[BoxAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Authenticate with Box and return the access keyword arguments.

        A ``JWTAuth`` object is built from ``box_app_config`` and
        authenticated, then returned under the ``"oauth"`` key together with
        every other access-config field. ``box_app_config`` itself is not
        included in the result.
        """
        from boxsdk import JWTAuth

        ac = self.access_config.get_secret_value()

        # Create and authenticate the JWTAuth object
        oauth = JWTAuth.from_settings_dictionary(ac.box_app_config)
        oauth.authenticate_instance()

        access_kwargs_with_oauth: dict[str, Any] = {"oauth": oauth}
        remaining_config: dict[str, Any] = ac.model_dump()
        remaining_config.pop("box_app_config", None)
        # Applied last, as in the original, so dumped fields take precedence.
        access_kwargs_with_oauth.update(remaining_config)

        return access_kwargs_with_oauth

    def wrap_error(self, e: Exception) -> Exception:
        """Translate a Box SDK exception into one of the ingest error types.

        Args:
            e: the exception caught while talking to Box.

        Returns:
            ``UserAuthError`` for ``BoxOAuthException``, ``UserError`` for a
            4xx status, ``ProviderError`` for a status of 500 or above, and
            ``e`` itself (after logging) for anything else.
        """
        from boxsdk.exception import BoxAPIException, BoxOAuthException

        if isinstance(e, BoxOAuthException):
            return UserAuthError(e.message)
        if not isinstance(e, BoxAPIException):
            logger.error(f"unhandled exception from box ({type(e)}): {e}", exc_info=True)
            return e
        # Always hand a string to the wrapped error; the original passed the
        # exception object itself when `e.message` was empty.
        message = e.message or str(e)
        if error_code_status := e.status:
            if 400 <= error_code_status < 500:
                return UserError(message)
            if error_code_status >= 500:
                return ProviderError(message)

        logger.error(f"unhandled exception from box ({type(e)}): {e}", exc_info=True)
        return e

    @requires_dependencies(["boxfs"], extras="box")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["BoxFileSystem", None, None]:
        """Yield the client obtained from the parent ``get_client`` for ``protocol``."""
        with super().get_client(protocol=protocol) as client:
            yield client
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# Indexer for the Box connector: supplies Box-specific metadata extraction on
# top of FsspecIndexer.
@dataclass
class BoxIndexer(FsspecIndexer):
    connection_config: BoxConnectionConfig
    index_config: BoxIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        """Build ``FileDataSourceMetadata`` from one Box file-info dict.

        ``modified_at`` and ``created_at`` are parsed with ``dateutil`` and
        stored as stringified epoch timestamps; a missing or empty value
        leaves the date as ``None``. The ``id`` value is used both as the
        version and as ``file_id`` in the record locator.
        """

        def as_epoch_string(raw_value):
            # Falsy values (missing key, None, empty string) are not parsed.
            if not raw_value:
                return None
            return str(parser.parse(raw_value).timestamp())

        remote_name = file_data["name"]
        # Same evaluation order as the original: modified first, then created.
        modified_epoch = as_epoch_string(file_data.get("modified_at"))
        created_epoch = as_epoch_string(file_data.get("created_at"))
        file_id = file_data.get("id")
        protocol = self.index_config.protocol

        return FileDataSourceMetadata(
            date_created=created_epoch,
            date_modified=modified_epoch,
            date_processed=str(time()),
            version=file_id,
            url=f"{protocol}://{remote_name}",
            record_locator={
                "protocol": protocol,
                "remote_file_path": self.index_config.remote_url,
                "file_id": file_id,
            },
            filesize_bytes=file_data.get("size"),
        )
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class BoxDownloaderConfig(FsspecDownloaderConfig):
    # Declares no fields of its own; used as the Box downloader's config type.
    pass
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# Downloader for the Box connector, fixed to the "box" protocol. The field
# declaration order is kept exactly as in the original.
@dataclass
class BoxDownloader(FsspecDownloader):
    protocol: str = "box"
    connection_config: BoxConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh BoxDownloaderConfig is created when no config is supplied.
    download_config: Optional[BoxDownloaderConfig] = field(
        default_factory=BoxDownloaderConfig
    )
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class BoxUploaderConfig(FsspecUploaderConfig):
    # Declares no fields of its own; used as the Box uploader's config type.
    pass
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# Uploader for the Box connector. The field declaration order is kept exactly
# as in the original.
@dataclass
class BoxUploader(FsspecUploader):
    connector_type: str = CONNECTOR_TYPE
    connection_config: BoxConnectionConfig
    # Defaults to None when no upload config is passed.
    upload_config: BoxUploaderConfig = field(default=None)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# Registry entry bundling the Box classes used when reading from Box.
box_source_entry = SourceRegistryEntry(
    connection_config=BoxConnectionConfig,
    indexer=BoxIndexer,
    indexer_config=BoxIndexerConfig,
    downloader=BoxDownloader,
    downloader_config=BoxDownloaderConfig,
)

# Registry entry bundling the Box classes used when writing to Box.
box_destination_entry = DestinationRegistryEntry(
    connection_config=BoxConnectionConfig,
    uploader=BoxUploader,
    uploader_config=BoxUploaderConfig,
)
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from time import time
|
|
6
|
+
from typing import TYPE_CHECKING, Generator, Optional
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, Secret
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
|
+
from unstructured_ingest.v2.errors import (
|
|
12
|
+
ProviderError,
|
|
13
|
+
UserAuthError,
|
|
14
|
+
UserError,
|
|
15
|
+
)
|
|
16
|
+
from unstructured_ingest.v2.errors import (
|
|
17
|
+
RateLimitError as CustomRateLimitError,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
|
|
20
|
+
from unstructured_ingest.v2.logger import logger
|
|
21
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
22
|
+
DestinationRegistryEntry,
|
|
23
|
+
SourceRegistryEntry,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
26
|
+
FsspecAccessConfig,
|
|
27
|
+
FsspecConnectionConfig,
|
|
28
|
+
FsspecDownloader,
|
|
29
|
+
FsspecDownloaderConfig,
|
|
30
|
+
FsspecIndexer,
|
|
31
|
+
FsspecIndexerConfig,
|
|
32
|
+
FsspecUploader,
|
|
33
|
+
FsspecUploaderConfig,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from dropboxdrivefs import DropboxDriveFileSystem
|
|
38
|
+
|
|
39
|
+
CONNECTOR_TYPE = "dropbox"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class DropboxIndexerConfig(FsspecIndexerConfig):
    def model_post_init(self, __context):
        # Make sure path_without_protocol begins with "/"; a path that
        # already does is left untouched.
        current_path = self.path_without_protocol
        if current_path.startswith("/"):
            return
        self.path_without_protocol = "/" + current_path
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class DropboxAccessConfig(FsspecAccessConfig):
    # Optional; left as None when no token is supplied.
    token: Optional[str] = Field(default=None, description="Dropbox access token.")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class DropboxConnectionConfig(FsspecConnectionConfig):
    # Connection settings for the Dropbox connector; the only supported
    # protocol is "dropbox".
    supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"], init=False)
    access_config: Secret[DropboxAccessConfig] = Field(
        default=DropboxAccessConfig(), validate_default=True
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["DropboxDriveFileSystem", None, None]:
        """Yield the client obtained from the parent ``get_client`` for ``protocol``."""
        with super().get_client(protocol=protocol) as client:
            yield client

    def wrap_error(self, e: Exception) -> Exception:
        """Translate a Dropbox SDK exception into one of the ingest error types.

        Args:
            e: the exception caught while talking to Dropbox.

        Returns:
            ``UserAuthError`` for ``AuthError``, the ingest ``RateLimitError``
            for Dropbox's ``RateLimitError``, ``UserError`` for a 4xx status,
            ``ProviderError`` for a status of 500 or above, and ``e`` itself
            (after logging) for anything else. The wrapped error is always
            returned, never raised, so the caller decides how to raise it.
        """
        from dropbox.exceptions import AuthError, HttpError, RateLimitError

        if not isinstance(e, HttpError):
            logger.error(f"unhandled exception from dropbox ({type(e)}): {e}", exc_info=True)
            return e
        if isinstance(e, AuthError):
            # Returned rather than raised, matching every other branch and the
            # declared return type.
            return UserAuthError(e.error)
        if isinstance(e, RateLimitError):
            return CustomRateLimitError(e.error)
        status_code = e.status_code
        # Both arms of the original truthiness check passed the value of
        # `e.body`, so a single direct read is equivalent.
        if 400 <= status_code < 500:
            return UserError(e.body)
        if status_code >= 500:
            return ProviderError(e.body)
        logger.error(f"unhandled exception from dropbox ({type(e)}): {e}", exc_info=True)
        return e
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# Indexer for the Dropbox connector: supplies Dropbox-specific path and
# metadata extraction on top of FsspecIndexer.
@dataclass
class DropboxIndexer(FsspecIndexer):
    connection_config: DropboxConnectionConfig
    index_config: DropboxIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def get_path(self, file_data: dict) -> str:
        """Return the ``name`` value of ``file_data`` unchanged."""
        return file_data["name"]

    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        """Build ``FileDataSourceMetadata`` from one Dropbox file-info dict.

        When both ``server_modified`` and ``client_modified`` are present, the
        earlier one becomes ``date_created`` and the later one
        ``date_modified``, both as stringified epoch timestamps. If either is
        missing, both dates stay ``None``.
        """
        # Leading slashes are stripped so the URL below has no extra "/".
        path = file_data["name"].lstrip("/")
        date_created = None
        date_modified = None
        server_modified = file_data.get("server_modified")
        client_modified = file_data.get("client_modified")
        if server_modified and client_modified:
            # min/max also covers equal timestamps, which the original strict
            # `>` / `<` comparisons skipped, leaving both dates as None.
            date_created = str(min(server_modified, client_modified).timestamp())
            date_modified = str(max(server_modified, client_modified).timestamp())

        file_size = file_data.get("size")

        version = file_data.get("content_hash")
        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
            "file_id": file_data.get("id"),
        }
        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=version,
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
            filesize_bytes=file_size,
        )
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class DropboxDownloaderConfig(FsspecDownloaderConfig):
    # Declares no fields of its own; used as the Dropbox downloader's config type.
    pass
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# Downloader for the Dropbox connector, fixed to the "dropbox" protocol. The
# field declaration order is kept exactly as in the original.
@dataclass
class DropboxDownloader(FsspecDownloader):
    protocol: str = "dropbox"
    connection_config: DropboxConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh DropboxDownloaderConfig is created when no config is supplied.
    download_config: Optional[DropboxDownloaderConfig] = field(
        default_factory=DropboxDownloaderConfig
    )
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class DropboxUploaderConfig(FsspecUploaderConfig):
    # Declares no fields of its own; used as the Dropbox uploader's config type.
    pass
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# Uploader for the Dropbox connector. The field declaration order is kept
# exactly as in the original.
@dataclass
class DropboxUploader(FsspecUploader):
    connector_type: str = CONNECTOR_TYPE
    connection_config: DropboxConnectionConfig
    # Defaults to None when no upload config is passed.
    upload_config: DropboxUploaderConfig = field(default=None)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# Registry entry bundling the Dropbox classes used when reading from Dropbox.
dropbox_source_entry = SourceRegistryEntry(
    connection_config=DropboxConnectionConfig,
    indexer=DropboxIndexer,
    indexer_config=DropboxIndexerConfig,
    downloader=DropboxDownloader,
    downloader_config=DropboxDownloaderConfig,
)

# Registry entry bundling the Dropbox classes used when writing to Dropbox.
dropbox_destination_entry = DestinationRegistryEntry(
    connection_config=DropboxConnectionConfig,
    uploader=DropboxUploader,
    uploader_config=DropboxUploaderConfig,
)
|