unstructured-ingest 0.6.4 (py3-none-any.whl) → 0.7.1 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
More details are linked from the original registry diff page.
- examples/airtable.py +44 -0
- examples/azure_cognitive_search.py +55 -0
- examples/chroma.py +54 -0
- examples/couchbase.py +55 -0
- examples/databricks_volumes_dest.py +55 -0
- examples/databricks_volumes_source.py +53 -0
- examples/delta_table.py +45 -0
- examples/discord_example.py +36 -0
- examples/elasticsearch.py +49 -0
- examples/google_drive.py +45 -0
- examples/kdbai.py +54 -0
- examples/local.py +36 -0
- examples/milvus.py +44 -0
- examples/mongodb.py +53 -0
- examples/opensearch.py +50 -0
- examples/pinecone.py +57 -0
- examples/s3.py +38 -0
- examples/salesforce.py +44 -0
- examples/sharepoint.py +47 -0
- examples/singlestore.py +49 -0
- examples/sql.py +90 -0
- examples/vectara.py +54 -0
- examples/weaviate.py +44 -0
- test/integration/chunkers/test_chunkers.py +1 -1
- test/integration/connectors/conftest.py +1 -1
- test/integration/connectors/databricks/test_volumes_native.py +3 -3
- test/integration/connectors/discord/test_discord.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +2 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
- test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
- test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
- test/integration/connectors/sql/test_postgres.py +2 -2
- test/integration/connectors/sql/test_singlestore.py +2 -2
- test/integration/connectors/sql/test_snowflake.py +2 -2
- test/integration/connectors/sql/test_sqlite.py +2 -2
- test/integration/connectors/sql/test_vastdb.py +1 -1
- test/integration/connectors/test_astradb.py +2 -2
- test/integration/connectors/test_azure_ai_search.py +2 -2
- test/integration/connectors/test_chroma.py +2 -2
- test/integration/connectors/test_confluence.py +1 -1
- test/integration/connectors/test_delta_table.py +2 -2
- test/integration/connectors/test_dropbox.py +2 -2
- test/integration/connectors/test_github.py +1 -1
- test/integration/connectors/test_google_drive.py +2 -2
- test/integration/connectors/test_jira.py +1 -1
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +2 -2
- test/integration/connectors/test_mongodb.py +2 -2
- test/integration/connectors/test_neo4j.py +7 -7
- test/integration/connectors/test_notion.py +2 -2
- test/integration/connectors/test_onedrive.py +2 -2
- test/integration/connectors/test_pinecone.py +3 -3
- test/integration/connectors/test_qdrant.py +6 -6
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +3 -3
- test/integration/connectors/test_sharepoint.py +1 -1
- test/integration/connectors/test_vectara.py +4 -4
- test/integration/connectors/test_zendesk.py +2 -2
- test/integration/connectors/utils/validation/destination.py +2 -2
- test/integration/connectors/utils/validation/source.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +1 -1
- test/integration/connectors/weaviate/test_local.py +2 -2
- test/integration/embedders/test_azure_openai.py +1 -1
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -1
- test/integration/embedders/test_mixedbread.py +1 -1
- test/integration/embedders/test_octoai.py +2 -2
- test/integration/embedders/test_openai.py +2 -2
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +1 -1
- test/integration/embedders/test_voyageai.py +1 -1
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
- test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
- test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
- test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
- test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
- test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
- test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
- test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
- test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
- test/unit/test_html.py +1 -1
- test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
- test/unit/test_utils.py +106 -97
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/__init__.py +0 -14
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +259 -9
- unstructured_ingest/cli/base/dest.py +58 -61
- unstructured_ingest/cli/base/src.py +54 -36
- unstructured_ingest/cli/cli.py +4 -17
- unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
- unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
- unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
- unstructured_ingest/embed/bedrock.py +3 -3
- unstructured_ingest/embed/octoai.py +3 -3
- unstructured_ingest/embed/openai.py +3 -3
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +4 -4
- unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
- unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
- unstructured_ingest/{v2/otel.py → otel.py} +1 -1
- unstructured_ingest/pipeline/__init__.py +0 -22
- unstructured_ingest/pipeline/interfaces.py +179 -238
- unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
- unstructured_ingest/pipeline/pipeline.py +388 -97
- unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
- unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
- unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
- unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
- unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
- unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +55 -27
- unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
- unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
- unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
- unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
- unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
- unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
- unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
- unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
- unstructured_ingest/utils/compression.py +1 -48
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/utils/html.py +3 -3
- unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
- unstructured_ingest/utils/string_and_date_utils.py +1 -1
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/METADATA +98 -97
- unstructured_ingest-0.7.1.dist-info/RECORD +370 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/top_level.txt +1 -0
- test/unit/v2/test_utils.py +0 -82
- unstructured_ingest/cli/cmd_factory.py +0 -12
- unstructured_ingest/cli/cmds/__init__.py +0 -145
- unstructured_ingest/cli/cmds/airtable.py +0 -69
- unstructured_ingest/cli/cmds/astradb.py +0 -99
- unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
- unstructured_ingest/cli/cmds/biomed.py +0 -52
- unstructured_ingest/cli/cmds/chroma.py +0 -104
- unstructured_ingest/cli/cmds/clarifai.py +0 -71
- unstructured_ingest/cli/cmds/confluence.py +0 -69
- unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
- unstructured_ingest/cli/cmds/delta_table.py +0 -94
- unstructured_ingest/cli/cmds/discord.py +0 -47
- unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
- unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
- unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
- unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
- unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
- unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
- unstructured_ingest/cli/cmds/github.py +0 -54
- unstructured_ingest/cli/cmds/gitlab.py +0 -54
- unstructured_ingest/cli/cmds/google_drive.py +0 -49
- unstructured_ingest/cli/cmds/hubspot.py +0 -70
- unstructured_ingest/cli/cmds/jira.py +0 -71
- unstructured_ingest/cli/cmds/kafka.py +0 -102
- unstructured_ingest/cli/cmds/local.py +0 -43
- unstructured_ingest/cli/cmds/mongodb.py +0 -72
- unstructured_ingest/cli/cmds/notion.py +0 -48
- unstructured_ingest/cli/cmds/onedrive.py +0 -66
- unstructured_ingest/cli/cmds/opensearch.py +0 -117
- unstructured_ingest/cli/cmds/outlook.py +0 -67
- unstructured_ingest/cli/cmds/pinecone.py +0 -71
- unstructured_ingest/cli/cmds/qdrant.py +0 -124
- unstructured_ingest/cli/cmds/reddit.py +0 -67
- unstructured_ingest/cli/cmds/salesforce.py +0 -58
- unstructured_ingest/cli/cmds/sharepoint.py +0 -66
- unstructured_ingest/cli/cmds/slack.py +0 -56
- unstructured_ingest/cli/cmds/sql.py +0 -66
- unstructured_ingest/cli/cmds/vectara.py +0 -66
- unstructured_ingest/cli/cmds/weaviate.py +0 -98
- unstructured_ingest/cli/cmds/wikipedia.py +0 -40
- unstructured_ingest/cli/common.py +0 -7
- unstructured_ingest/cli/interfaces.py +0 -663
- unstructured_ingest/cli/utils.py +0 -205
- unstructured_ingest/connector/airtable.py +0 -309
- unstructured_ingest/connector/astradb.py +0 -267
- unstructured_ingest/connector/azure_ai_search.py +0 -144
- unstructured_ingest/connector/biomed.py +0 -320
- unstructured_ingest/connector/chroma.py +0 -158
- unstructured_ingest/connector/clarifai.py +0 -122
- unstructured_ingest/connector/confluence.py +0 -285
- unstructured_ingest/connector/databricks_volumes.py +0 -137
- unstructured_ingest/connector/delta_table.py +0 -203
- unstructured_ingest/connector/discord.py +0 -180
- unstructured_ingest/connector/elasticsearch.py +0 -396
- unstructured_ingest/connector/fsspec/azure.py +0 -78
- unstructured_ingest/connector/fsspec/box.py +0 -109
- unstructured_ingest/connector/fsspec/dropbox.py +0 -160
- unstructured_ingest/connector/fsspec/fsspec.py +0 -359
- unstructured_ingest/connector/fsspec/gcs.py +0 -82
- unstructured_ingest/connector/fsspec/s3.py +0 -62
- unstructured_ingest/connector/fsspec/sftp.py +0 -81
- unstructured_ingest/connector/git.py +0 -124
- unstructured_ingest/connector/github.py +0 -174
- unstructured_ingest/connector/gitlab.py +0 -142
- unstructured_ingest/connector/google_drive.py +0 -348
- unstructured_ingest/connector/hubspot.py +0 -278
- unstructured_ingest/connector/jira.py +0 -469
- unstructured_ingest/connector/kafka.py +0 -293
- unstructured_ingest/connector/local.py +0 -139
- unstructured_ingest/connector/mongodb.py +0 -284
- unstructured_ingest/connector/notion/client.py +0 -248
- unstructured_ingest/connector/notion/connector.py +0 -469
- unstructured_ingest/connector/notion/helpers.py +0 -584
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
- unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
- unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
- unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
- unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
- unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
- unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
- unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
- unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
- unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
- unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
- unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
- unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
- unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
- unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
- unstructured_ingest/connector/notion/types/date.py +0 -26
- unstructured_ingest/connector/notion/types/file.py +0 -51
- unstructured_ingest/connector/notion/types/user.py +0 -76
- unstructured_ingest/connector/onedrive.py +0 -232
- unstructured_ingest/connector/opensearch.py +0 -218
- unstructured_ingest/connector/outlook.py +0 -285
- unstructured_ingest/connector/pinecone.py +0 -150
- unstructured_ingest/connector/qdrant.py +0 -144
- unstructured_ingest/connector/reddit.py +0 -166
- unstructured_ingest/connector/registry.py +0 -109
- unstructured_ingest/connector/salesforce.py +0 -301
- unstructured_ingest/connector/sharepoint.py +0 -573
- unstructured_ingest/connector/slack.py +0 -224
- unstructured_ingest/connector/sql.py +0 -199
- unstructured_ingest/connector/vectara.py +0 -253
- unstructured_ingest/connector/weaviate.py +0 -190
- unstructured_ingest/connector/wikipedia.py +0 -208
- unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
- unstructured_ingest/enhanced_dataclass/core.py +0 -99
- unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
- unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
- unstructured_ingest/interfaces.py +0 -852
- unstructured_ingest/pipeline/copy.py +0 -19
- unstructured_ingest/pipeline/doc_factory.py +0 -12
- unstructured_ingest/pipeline/partition.py +0 -60
- unstructured_ingest/pipeline/permissions.py +0 -12
- unstructured_ingest/pipeline/reformat/chunking.py +0 -134
- unstructured_ingest/pipeline/reformat/embedding.py +0 -64
- unstructured_ingest/pipeline/source.py +0 -77
- unstructured_ingest/pipeline/utils.py +0 -6
- unstructured_ingest/pipeline/write.py +0 -18
- unstructured_ingest/processor.py +0 -93
- unstructured_ingest/runner/__init__.py +0 -104
- unstructured_ingest/runner/airtable.py +0 -35
- unstructured_ingest/runner/astradb.py +0 -34
- unstructured_ingest/runner/base_runner.py +0 -89
- unstructured_ingest/runner/biomed.py +0 -45
- unstructured_ingest/runner/confluence.py +0 -35
- unstructured_ingest/runner/delta_table.py +0 -34
- unstructured_ingest/runner/discord.py +0 -35
- unstructured_ingest/runner/elasticsearch.py +0 -40
- unstructured_ingest/runner/fsspec/azure.py +0 -30
- unstructured_ingest/runner/fsspec/box.py +0 -28
- unstructured_ingest/runner/fsspec/dropbox.py +0 -30
- unstructured_ingest/runner/fsspec/fsspec.py +0 -40
- unstructured_ingest/runner/fsspec/gcs.py +0 -28
- unstructured_ingest/runner/fsspec/s3.py +0 -28
- unstructured_ingest/runner/fsspec/sftp.py +0 -28
- unstructured_ingest/runner/github.py +0 -37
- unstructured_ingest/runner/gitlab.py +0 -37
- unstructured_ingest/runner/google_drive.py +0 -35
- unstructured_ingest/runner/hubspot.py +0 -35
- unstructured_ingest/runner/jira.py +0 -35
- unstructured_ingest/runner/kafka.py +0 -34
- unstructured_ingest/runner/local.py +0 -23
- unstructured_ingest/runner/mongodb.py +0 -34
- unstructured_ingest/runner/notion.py +0 -61
- unstructured_ingest/runner/onedrive.py +0 -35
- unstructured_ingest/runner/opensearch.py +0 -40
- unstructured_ingest/runner/outlook.py +0 -33
- unstructured_ingest/runner/reddit.py +0 -35
- unstructured_ingest/runner/salesforce.py +0 -33
- unstructured_ingest/runner/sharepoint.py +0 -35
- unstructured_ingest/runner/slack.py +0 -33
- unstructured_ingest/runner/utils.py +0 -47
- unstructured_ingest/runner/wikipedia.py +0 -35
- unstructured_ingest/runner/writers/__init__.py +0 -48
- unstructured_ingest/runner/writers/astradb.py +0 -22
- unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
- unstructured_ingest/runner/writers/base_writer.py +0 -26
- unstructured_ingest/runner/writers/chroma.py +0 -22
- unstructured_ingest/runner/writers/clarifai.py +0 -19
- unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
- unstructured_ingest/runner/writers/delta_table.py +0 -24
- unstructured_ingest/runner/writers/elasticsearch.py +0 -24
- unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
- unstructured_ingest/runner/writers/fsspec/box.py +0 -21
- unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
- unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
- unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
- unstructured_ingest/runner/writers/kafka.py +0 -21
- unstructured_ingest/runner/writers/mongodb.py +0 -21
- unstructured_ingest/runner/writers/opensearch.py +0 -26
- unstructured_ingest/runner/writers/pinecone.py +0 -21
- unstructured_ingest/runner/writers/qdrant.py +0 -19
- unstructured_ingest/runner/writers/sql.py +0 -22
- unstructured_ingest/runner/writers/vectara.py +0 -22
- unstructured_ingest/runner/writers/weaviate.py +0 -21
- unstructured_ingest/utils/google_filetype.py +0 -9
- unstructured_ingest/v2/__init__.py +0 -1
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +0 -4
- unstructured_ingest/v2/cli/base/cmd.py +0 -269
- unstructured_ingest/v2/cli/base/dest.py +0 -85
- unstructured_ingest/v2/cli/base/src.py +0 -85
- unstructured_ingest/v2/cli/cli.py +0 -24
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/logger.py +0 -126
- unstructured_ingest/v2/main.py +0 -11
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +0 -211
- unstructured_ingest/v2/pipeline/pipeline.py +0 -408
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
- {test/unit/v2 → examples}/__init__.py +0 -0
- /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
- /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/data_generator.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
- /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
- /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
- /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
- /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
- /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
- /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
- /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
- /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
- /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
- /unstructured_ingest/{v2 → utils}/constants.py +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/entry_points.txt +0 -0
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Box Connector
|
|
3
|
-
Box does not make it simple to download files with an App.
|
|
4
|
-
First of all, this does not work with a free Box account.
|
|
5
|
-
Make sure the App service email is a collaborator for your folder (co-owner or editor)
|
|
6
|
-
Make sure you have the 'write all files' application scope
|
|
7
|
-
You may also need to enable 'Make API calls using the as-user header'
|
|
8
|
-
REAUTHORIZE app after making any of the above changes
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import typing as t
|
|
12
|
-
from dataclasses import dataclass
|
|
13
|
-
|
|
14
|
-
from unstructured_ingest.connector.fsspec.fsspec import (
|
|
15
|
-
FsspecDestinationConnector,
|
|
16
|
-
FsspecIngestDoc,
|
|
17
|
-
FsspecSourceConnector,
|
|
18
|
-
FsspecWriteConfig,
|
|
19
|
-
SimpleFsspecConfig,
|
|
20
|
-
)
|
|
21
|
-
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
22
|
-
from unstructured_ingest.interfaces import AccessConfig
|
|
23
|
-
from unstructured_ingest.logger import logger
|
|
24
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
class AccessTokenError(Exception):
    """Signals a problem with the access token."""
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
@dataclass
class BoxWriteConfig(FsspecWriteConfig):
    """Write settings for Box; declares no fields beyond ``FsspecWriteConfig``."""
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
@dataclass
class BoxAccessConfig(AccessConfig):
    """Access settings for the Box connector."""

    # Settings-file value handed to ``JWTAuth.from_settings_file`` when the
    # Box filesystem arguments are built; optional, defaults to ``None``.
    box_app_config: t.Optional[str] = None
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@dataclass
class SimpleBoxConfig(SimpleFsspecConfig):
    """Fsspec connector configuration specialised for Box."""

    access_config: BoxAccessConfig = None

    @requires_dependencies(["boxfs"], extras="box")
    def get_access_config(self) -> dict:
        """Build the keyword arguments used to construct the Box filesystem.

        The ``oauth`` object is created on demand instead of being stored on
        the config, because it is not serializable.

        Returns:
            A dict with an ``oauth`` entry plus every value from the access
            config except ``box_app_config``.
        """
        from boxsdk import JWTAuth

        oauth = JWTAuth.from_settings_file(self.access_config.box_app_config)
        remaining: dict[str, t.Any] = self.access_config.to_dict()
        remaining.pop("box_app_config", None)
        # Later keys win, so values from the access config take precedence
        # over the generated ``oauth`` entry if the names ever collide.
        return {"oauth": oauth, **remaining}
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
@dataclass
class BoxIngestDoc(FsspecIngestDoc):
    """Single Box document; fetching is inherited from ``FsspecIngestDoc``."""

    connector_config: SimpleBoxConfig
    registry_name: str = "box"

    @SourceConnectionError.wrap
    @requires_dependencies(["boxfs", "fsspec"], extras="box")
    def get_file(self):
        """Delegate to the parent ``get_file``.

        The override only adds the Box dependency requirement and the
        ``SourceConnectionError.wrap`` decorator around the inherited logic.
        """
        super().get_file()
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
@dataclass
class BoxSourceConnector(FsspecSourceConnector):
    """Source connector for Box built on the fsspec source connector."""

    connector_config: SimpleBoxConfig

    @requires_dependencies(["boxfs"], extras="box")
    def check_connection(self):
        """Validate the configuration by constructing a ``BoxFileSystem``.

        Raises:
            SourceConnectionError: if the filesystem cannot be constructed.
                The underlying exception is attached as ``__cause__``.
        """
        from boxfs import BoxFileSystem

        try:
            BoxFileSystem(**self.connector_config.get_access_config())
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain explicitly so the root cause survives in the traceback.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def __post_init__(self):
        # Documents produced by this connector are Box ingest docs.
        self.ingest_doc_cls: t.Type[BoxIngestDoc] = BoxIngestDoc
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
@dataclass
class BoxDestinationConnector(FsspecDestinationConnector):
    """Destination connector for Box built on the fsspec destination connector."""

    connector_config: SimpleBoxConfig
    write_config: BoxWriteConfig

    @requires_dependencies(["boxfs", "fsspec"], extras="box")
    def initialize(self):
        """Delegate to the parent ``initialize`` under the Box dependency requirement."""
        super().initialize()

    @requires_dependencies(["boxfs"], extras="box")
    def check_connection(self):
        """Validate the configuration by constructing a ``BoxFileSystem``.

        Raises:
            DestinationConnectionError: if the filesystem cannot be
                constructed. The underlying exception is attached as
                ``__cause__``.
        """
        from boxfs import BoxFileSystem

        try:
            BoxFileSystem(**self.connector_config.get_access_config())
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain explicitly so the root cause survives in the traceback.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e
|
|
@@ -1,160 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Dropbox Connector
|
|
3
|
-
The Dropbox Connector presents a couple abnormal situations.
|
|
4
|
-
1) They don't have an unexpiring token
|
|
5
|
-
2) They require a forward slash `/` in front of the remote_file_path. This presents
|
|
6
|
-
some real problems creating paths. When appending a path that begins with a
|
|
7
|
-
forward slash to any path, whether using the / shorthand or joinpath, causes the
|
|
8
|
-
starting path to disappear. So the `/` needs to be stripped off.
|
|
9
|
-
3) To list and get files from the Dropbox root directory, use "", " ", or " /" as the path
|
|
10
|
-
"""
|
|
11
|
-
|
|
12
|
-
import re
|
|
13
|
-
from dataclasses import dataclass
|
|
14
|
-
from pathlib import Path
|
|
15
|
-
from typing import Type
|
|
16
|
-
|
|
17
|
-
from unstructured_ingest.connector.fsspec.fsspec import (
|
|
18
|
-
FsspecDestinationConnector,
|
|
19
|
-
FsspecIngestDoc,
|
|
20
|
-
FsspecSourceConnector,
|
|
21
|
-
FsspecWriteConfig,
|
|
22
|
-
SimpleFsspecConfig,
|
|
23
|
-
)
|
|
24
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
25
|
-
from unstructured_ingest.error import SourceConnectionError
|
|
26
|
-
from unstructured_ingest.interfaces import AccessConfig
|
|
27
|
-
from unstructured_ingest.logger import logger
|
|
28
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
class MissingFolderError(Exception):
    """No folder exists under the given name. For the root, try `dropbox:// /`."""
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
@dataclass
class DropboxAccessConfig(AccessConfig):
    """Access settings for the Dropbox connector."""

    # Required token, declared via ``enhanced_field`` with ``sensitive=True``.
    token: str = enhanced_field(sensitive=True)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@dataclass
class DropboxWriteConfig(FsspecWriteConfig):
    """Write settings for Dropbox; declares no fields beyond ``FsspecWriteConfig``."""
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
@dataclass
class SimpleDropboxConfig(SimpleFsspecConfig):
    """Fsspec connector configuration specialised for Dropbox."""

    # Narrows the inherited field to the Dropbox access settings.
    access_config: DropboxAccessConfig = None
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
@dataclass
class DropboxIngestDoc(FsspecIngestDoc):
    """Single Dropbox document with Dropbox-specific local path handling.

    Dropbox requires a forward slash at the front of the folder path. Joining
    a path that starts with ``/`` onto another path discards the left-hand
    side, so the leading part is removed before any join. A ``dir_path`` of
    ``" "`` denotes the Dropbox root.
    """

    connector_config: SimpleDropboxConfig
    registry_name: str = "dropbox"

    @SourceConnectionError.wrap
    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def get_file(self):
        """Delegate to the parent ``get_file`` under the Dropbox dependency requirement."""
        super().get_file()

    def _relative_remote_path(self) -> str:
        """Return ``remote_file_path`` without its leading root or folder prefix."""
        if self.connector_config.dir_path == " ":
            # Root listing: only the leading slash has to go.
            return re.sub("^/", "", self.remote_file_path)
        # count=1 strips just the first ``/<dir>/`` occurrence; an unbounded
        # replace would also eat nested folders that share the same name.
        return self.remote_file_path.replace(
            f"/{self.connector_config.dir_path}/",
            "",
            1,
        )

    @property
    def _output_filename(self):
        """Local ``.json`` output path under the processor's output directory."""
        return Path(self.processor_config.output_dir) / f"{self._relative_remote_path()}.json"

    def _tmp_download_file(self):
        """Local download path, or ``""`` when no download directory is configured."""
        download_dir: str = self.read_config.download_dir if self.read_config.download_dir else ""
        if not download_dir:
            return ""
        return Path(download_dir) / self._relative_remote_path()
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
@dataclass
class DropboxSourceConnector(FsspecSourceConnector):
    """Source connector for Dropbox.

    Dropbox requires a forward slash at the front of the folder path, so every
    remote path used here is built as ``/<path_without_protocol>``.
    """

    connector_config: SimpleDropboxConfig

    def __post_init__(self):
        # Documents produced by this connector are Dropbox ingest docs.
        self.ingest_doc_cls: Type[DropboxIngestDoc] = DropboxIngestDoc

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def initialize(self):
        """Create the filesystem and confirm the configured folder can be listed.

        Raises:
            SourceConnectionError: if creating the filesystem or listing the
                folder raises; the original exception is chained.
            ValueError: if the listing succeeds but is empty.
            MissingFolderError: if the listing returns ``None``.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        try:
            self.fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
                **self.connector_config.get_access_config(),
            )
            ls_output = self.fs.ls(f"/{self.connector_config.path_without_protocol}")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

        if ls_output:
            return
        # The previous ``elif ls_output`` test could never be true once the
        # first check failed, so an empty folder was reported as missing.
        if ls_output is not None:
            raise ValueError(
                f"No objects found in {self.connector_config.remote_url}.",
            )
        raise MissingFolderError(
            "There is no folder by that name. For root try `dropbox:// /`",
        )

    def _list_files(self):
        """Return the remote paths of all entries that report a non-zero size.

        Directories listed in cloud storage can show up as 0-byte files, so
        entries with a falsy ``size`` are dropped.
        """
        remote_root = f"/{self.connector_config.path_without_protocol}"
        if not self.connector_config.recursive:
            # fs.ls does not walk directories
            return [
                entry.get("name")
                for entry in self.fs.ls(remote_root, detail=True)
                if entry.get("size")
            ]
        # fs.find will recursively walk directories;
        # "size" is a common key for all the cloud protocols with fs
        return [
            path
            for path, info in self.fs.find(remote_root, detail=True).items()
            if info.get("size")
        ]
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
@dataclass
class DropboxDestinationConnector(FsspecDestinationConnector):
    """Destination connector for Dropbox built on the fsspec destination connector."""

    # Annotated with the Dropbox config class, matching the Dropbox source
    # connector and the Box connectors (was the generic ``SimpleFsspecConfig``).
    connector_config: SimpleDropboxConfig
    write_config: DropboxWriteConfig
|
|
@@ -1,359 +0,0 @@
|
|
|
1
|
-
import fnmatch
|
|
2
|
-
import json
|
|
3
|
-
import os
|
|
4
|
-
import typing as t
|
|
5
|
-
from abc import ABC
|
|
6
|
-
from contextlib import suppress
|
|
7
|
-
from dataclasses import dataclass
|
|
8
|
-
from pathlib import Path, PurePath
|
|
9
|
-
|
|
10
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
11
|
-
from unstructured_ingest.error import (
|
|
12
|
-
DestinationConnectionError,
|
|
13
|
-
SourceConnectionError,
|
|
14
|
-
SourceConnectionNetworkError,
|
|
15
|
-
)
|
|
16
|
-
from unstructured_ingest.interfaces import (
|
|
17
|
-
BaseConnectorConfig,
|
|
18
|
-
BaseDestinationConnector,
|
|
19
|
-
BaseSingleIngestDoc,
|
|
20
|
-
BaseSourceConnector,
|
|
21
|
-
FsspecConfig,
|
|
22
|
-
IngestDocCleanupMixin,
|
|
23
|
-
SourceConnectorCleanupMixin,
|
|
24
|
-
SourceMetadata,
|
|
25
|
-
WriteConfig,
|
|
26
|
-
)
|
|
27
|
-
from unstructured_ingest.logger import logger
|
|
28
|
-
from unstructured_ingest.utils.compression import (
|
|
29
|
-
TAR_FILE_EXT,
|
|
30
|
-
ZIP_FILE_EXT,
|
|
31
|
-
CompressionSourceConnectorMixin,
|
|
32
|
-
)
|
|
33
|
-
from unstructured_ingest.utils.dep_check import (
|
|
34
|
-
requires_dependencies,
|
|
35
|
-
)
|
|
36
|
-
|
|
37
|
-
# Protocol names listed as supported remote fsspec backends; aliases of the
# same backend are kept side by side.
SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
    "s3", "s3a",
    "abfs", "az",
    "gs", "gcs",
    "box",
    "dropbox",
    "sftp",
]
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
@dataclass
class SimpleFsspecConfig(FsspecConfig, BaseConnectorConfig):
    """Connector configuration combining ``FsspecConfig`` with ``BaseConnectorConfig``."""
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
@dataclass
class FsspecIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Fetch a single remote document and locate its processed output.

    The processing itself is not done here. A cleanup method is also
    available; when things go wrong and it is not called, the downloaded file
    is left behind on the filesystem to assist debugging.
    """

    connector_config: SimpleFsspecConfig
    remote_file_path: str

    def _tmp_download_file(self):
        """Return the local path the remote file is downloaded to."""
        download_dir = self.read_config.download_dir if self.read_config.download_dir else ""
        # count=1 strips just the first ``<dir_path>/`` occurrence; an unbounded
        # replace would also remove the same text deeper inside the object key.
        relative_path = self.remote_file_path.replace(
            f"{self.connector_config.dir_path}/",
            "",
            1,
        )
        return Path(download_dir) / relative_path

    @property
    def _output_filename(self):
        """Return the local ``.json`` output path for this document.

        The name depends on whether the remote URL pointed at a single file,
        a directory, or a nested directory.
        """
        root = self.connector_config.path_without_protocol
        if self.remote_file_path == root:
            # The remote URL addressed this very file: keep only its basename.
            file = self.remote_file_path.split("/")[-1]
            filename = f"{file}.json"
        else:
            prefix = root if root.endswith("/") else f"{root}/"
            # Remove the configured root once so nested structure is preserved.
            filename = f"{self.remote_file_path.replace(prefix, '', 1)}.json"
        return Path(self.processor_config.output_dir) / filename

    def _create_full_tmp_dir_path(self):
        """Includes "directories" in the object path"""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetches the file from the current filesystem and stores it locally."""
        from fsspec import AbstractFileSystem, get_filesystem_class

        self._create_full_tmp_dir_path()
        fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )
        # ``_get_file`` performs the transfer. The extra ``fs.get`` that used to
        # follow repeated the identical call and downloaded every file twice.
        self._get_file(fs=fs)
        self.update_source_metadata()

    @SourceConnectionNetworkError.wrap
    def _get_file(self, fs):
        """Copy the remote file to the local download path using ``fs``."""
        fs.get(rpath=self.remote_file_path, lpath=self._tmp_download_file().as_posix())

    @requires_dependencies(["fsspec"])
    def update_source_metadata(self):
        """Populate ``source_metadata`` from the remote filesystem.

        Creation and modification dates stay ``None`` when the filesystem
        raises ``NotImplementedError`` for them.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )

        date_created = None
        with suppress(NotImplementedError):
            date_created = fs.created(self.remote_file_path).isoformat()

        date_modified = None
        with suppress(NotImplementedError):
            date_modified = fs.modified(self.remote_file_path).isoformat()

        # For the "gs" protocol the etag from ``fs.info`` is used as the
        # version; every other protocol uses ``fs.checksum``.
        version = (
            fs.checksum(self.remote_file_path)
            if self.connector_config.protocol != "gs"
            else fs.info(self.remote_file_path).get("etag", "")
        )
        file_exists = fs.exists(self.remote_file_path)
        self.source_metadata = SourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            version=str(version),
            source_url=f"{self.connector_config.protocol}://{self.remote_file_path}",
            exists=file_exists,
        )

    @property
    def filename(self):
        """The filename of the file after downloading from cloud"""
        return self._tmp_download_file()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Return the protocol and remote path that identify this document."""
        return {
            "protocol": self.connector_config.protocol,
            "remote_file_path": self.remote_file_path,
        }
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
@dataclass
class FsspecSourceConnector(
    SourceConnectorCleanupMixin,
    CompressionSourceConnectorMixin,
    BaseSourceConnector,
):
    """Source connector that fetches document(s) from an fsspec filesystem."""

    connector_config: SimpleFsspecConfig

    def check_connection(self):
        """Validate the configuration by listing the configured remote path.

        Raises:
            SourceConnectionError: if creating the filesystem or listing the
                path raises; the original exception is chained.
        """
        from fsspec import get_filesystem_class

        try:
            fs = get_filesystem_class(self.connector_config.protocol)(
                **self.connector_config.get_access_config(),
            )
            fs.ls(path=self.connector_config.path_without_protocol, detail=False)
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def __post_init__(self):
        # Subclasses replace this with their own ingest-doc class.
        self.ingest_doc_cls: t.Type[FsspecIngestDoc] = FsspecIngestDoc

    def initialize(self):
        """Create ``self.fs`` and verify the remote path contains something.

        Raises:
            ValueError: if listing the configured path returns no entries.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        self.fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )

        # Verify that can get metadata for an object, validates connections info.
        ls_output = self.fs.ls(self.connector_config.path_without_protocol, detail=False)
        if len(ls_output) < 1:
            raise ValueError(
                f"No objects found in {self.connector_config.remote_url}.",
            )

    def _list_files(self):
        """Return the remote paths of all entries with a positive size.

        Directories that are listed in cloud storage can cause problems
        because they are seen as 0 byte files, so they are filtered out. A
        missing or ``None`` size is treated as 0 instead of raising
        ``TypeError`` on the comparison.
        """
        remote_root = self.connector_config.path_without_protocol
        if not self.connector_config.recursive:
            # fs.ls does not walk directories
            return [
                entry.get("name")
                for entry in self.fs.ls(remote_root, detail=True)
                if (entry.get("size") or 0) > 0
            ]
        # fs.find will recursively walk directories;
        # "size" is a common key for all the cloud protocols with fs
        return [
            path
            for path, info in self.fs.find(remote_root, detail=True).items()
            if (info.get("size") or 0) > 0
        ]

    def does_path_match_glob(self, path: str) -> bool:
        """Return True when no glob is configured or ``path`` matches any pattern."""
        patterns = self.connector_config.file_glob
        if patterns is None:
            return True
        if any(fnmatch.fnmatch(path, pattern) for pattern in patterns):
            return True
        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
        return False

    def get_ingest_docs(self):
        """Build ingest docs for every listed file that passes the glob filter.

        Compressed files are skipped unless ``uncompress`` is set on the
        connector config; in that case each archive is expanded through
        ``process_compressed_doc`` and the archive itself is cleaned up.
        """
        raw_files = self._list_files()
        # If glob filters provided, use to filter on filepaths
        files = [f for f in raw_files if self.does_path_match_glob(f)]

        # Split compressed archives from regular files.
        compressed_file_ext = tuple(TAR_FILE_EXT + ZIP_FILE_EXT)
        compressed_files = [f for f in files if f.endswith(compressed_file_ext)]
        uncompressed_files = [f for f in files if not f.endswith(compressed_file_ext)]

        docs: t.List[BaseSingleIngestDoc] = [
            self.ingest_doc_cls(
                read_config=self.read_config,
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                remote_file_path=file,
            )
            for file in uncompressed_files
        ]
        if not self.connector_config.uncompress:
            return docs

        for compressed_file in compressed_files:
            compressed_doc = self.ingest_doc_cls(
                read_config=self.read_config,
                processor_config=self.processor_config,
                connector_config=self.connector_config,
                remote_file_path=compressed_file,
            )
            try:
                local_ingest_docs = self.process_compressed_doc(doc=compressed_doc)
                logger.info(f"adding {len(local_ingest_docs)} from {compressed_file}")
                docs.extend(local_ingest_docs)
            finally:
                # Always remove the downloaded archive, even if expansion failed.
                compressed_doc.cleanup_file()
        return docs
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
@dataclass
class WriteTextConfig(EnhancedDataClassJsonMixin, ABC):
    """Abstract base for the options held in ``FsspecWriteConfig.write_text_config``."""
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
@dataclass
class FsspecWriteConfig(WriteConfig):
    """Write settings shared by the fsspec destination connectors."""

    write_text_config: t.Optional[WriteTextConfig] = None

    def get_write_text_config(self) -> t.Dict[str, t.Any]:
        """Return the write-text options as a dict, or ``{}`` when none are set."""
        if not self.write_text_config:
            return {}
        return self.write_text_config.to_dict()
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
@dataclass
class FsspecDestinationConnector(BaseDestinationConnector):
    """Destination connector that uploads JSON output through an fsspec filesystem."""

    connector_config: SimpleFsspecConfig
    write_config: FsspecWriteConfig

    def initialize(self):
        """Create ``self.fs`` and run the connection check."""
        from fsspec import AbstractFileSystem, get_filesystem_class

        self.fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )
        self.check_connection()

    def check_connection(self):
        """Validate the configuration by listing the top-level remote location.

        Raises:
            DestinationConnectionError: if creating the filesystem or listing
                the location raises; the original exception is chained.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        try:
            fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
                **self.connector_config.get_access_config(),
            )

            # e.g. Dropbox path starts with /
            bucket_name = "/" if self.connector_config.path_without_protocol.startswith("/") else ""
            bucket_name += self.connector_config.dir_path.split("/")[0]

            logger.info(f"checking connection for destination {bucket_name}")
            fs.ls(path=bucket_name, detail=False)
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def write_dict(
        self,
        *args,
        elements_dict: t.List[t.Dict[str, t.Any]],
        filename: t.Optional[str] = None,
        indent: int = 4,
        encoding: str = "utf-8",
        **kwargs,
    ) -> None:
        """Serialize ``elements_dict`` to JSON and upload it.

        Args:
            elements_dict: the elements to serialize.
            filename: path relative to the configured remote folder; when
                empty, the configured path itself is the upload target.
            indent: JSON indentation passed to ``json.dumps``.
            encoding: text encoding passed to ``fs.write_text``.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )

        logger.info(f"writing content using filesystem: {type(fs).__name__}")

        # The single-argument ``os.path.join`` that used to follow returned its
        # input unchanged, so it was dropped; ``PurePath`` handles the join.
        output_folder = self.connector_config.path_without_protocol
        # Make sure filename doesn't begin (or end) with the file separator.
        filename = filename.strip(os.sep) if filename else filename
        output_path = str(PurePath(output_folder, filename)) if filename else output_folder
        full_output_path = f"{self.connector_config.protocol}://{output_path}"
        logger.debug(f"uploading content to {full_output_path}")
        write_text_configs = self.write_config.get_write_text_config() if self.write_config else {}
        fs.write_text(
            full_output_path,
            json.dumps(elements_dict, indent=indent),
            encoding=encoding,
            **write_text_configs,
        )

    def get_elements_dict(self, docs: t.List[BaseSingleIngestDoc]) -> t.List[t.Dict[str, t.Any]]:
        # Placeholder with no implementation here: it returns None.
        pass

    def write(self, docs: t.List[BaseSingleIngestDoc]) -> None:
        """Upload the local JSON output of every document in ``docs``."""
        for doc in docs:
            filename = doc.base_output_filename or None
            # Read as UTF-8 explicitly so the result does not depend on the
            # platform's default locale encoding.
            with open(doc._output_filename, encoding="utf-8") as json_file:
                logger.debug(f"uploading content from {doc._output_filename}")
                json_list = json.load(json_file)
                self.write_dict(elements_dict=json_list, filename=filename)
|