unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Generator, Optional
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
8
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
10
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
11
|
+
DestinationRegistryEntry,
|
|
12
|
+
SourceRegistryEntry,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
15
|
+
FsspecAccessConfig,
|
|
16
|
+
FsspecConnectionConfig,
|
|
17
|
+
FsspecDownloader,
|
|
18
|
+
FsspecDownloaderConfig,
|
|
19
|
+
FsspecIndexer,
|
|
20
|
+
FsspecIndexerConfig,
|
|
21
|
+
FsspecUploader,
|
|
22
|
+
FsspecUploaderConfig,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Identifier reused as the connector_type default by the Box connector classes in this module.
CONNECTOR_TYPE = "box"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class BoxIndexerConfig(FsspecIndexerConfig):
    """Indexer configuration for Box; adds nothing on top of FsspecIndexerConfig."""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class BoxAccessConfig(FsspecAccessConfig):
    """Access settings for the Box connector."""

    # Handed to boxsdk's JWTAuth.from_settings_file when the access kwargs are built.
    box_app_config: Optional[str] = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class BoxConnectionConfig(FsspecConnectionConfig):
    """Connection configuration for the Box connector."""

    supported_protocols: list[str] = field(default_factory=lambda: ["box"])
    access_config: BoxAccessConfig = enhanced_field(
        sensitive=True,
        default_factory=BoxAccessConfig,
    )
    connector_type: str = CONNECTOR_TYPE

    def get_access_config(self) -> dict[str, Any]:
        """Return the access kwargs with an "oauth" entry added.

        The oauth object is created on every call rather than stored on the
        config because it is not serializable. The "box_app_config" entry is
        dropped from the result; every other access-config entry is kept.
        """
        from boxsdk import JWTAuth

        # Build the auth object first, then layer the remaining settings on top of it.
        oauth = JWTAuth.from_settings_file(self.access_config.box_app_config)
        remaining: dict[str, Any] = self.access_config.to_dict()
        remaining.pop("box_app_config", None)
        return {"oauth": oauth, **remaining}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class BoxIndexer(FsspecIndexer):
    """Indexer for Box that defers all work to FsspecIndexer."""

    connection_config: BoxConnectionConfig
    index_config: BoxIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["boxfs"], extras="box")
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Forward to the parent run(); wrapped by requires_dependencies for boxfs."""
        file_data_stream = super().run(**kwargs)
        return file_data_stream
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class BoxDownloaderConfig(FsspecDownloaderConfig):
    """Downloader configuration for Box; adds nothing on top of FsspecDownloaderConfig."""
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class BoxDownloader(FsspecDownloader):
    """Downloader for Box that defers all work to FsspecDownloader."""

    protocol: str = "box"
    connection_config: BoxConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig)

    @requires_dependencies(["boxfs"], extras="box")
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Forward the synchronous download to the parent implementation."""
        response = super().run(file_data=file_data, **kwargs)
        return response

    @requires_dependencies(["boxfs"], extras="box")
    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Forward the asynchronous download to the parent implementation."""
        response = await super().run_async(file_data=file_data, **kwargs)
        return response
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@dataclass
class BoxUploaderConfig(FsspecUploaderConfig):
    """Uploader configuration for Box; adds nothing on top of FsspecUploaderConfig."""
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass
class BoxUploader(FsspecUploader):
    """Uploader for Box that defers all work to FsspecUploader."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: BoxConnectionConfig
    # Defaults to None even though the annotation is not Optional.
    upload_config: BoxUploaderConfig = None

    @requires_dependencies(["boxfs"], extras="box")
    def __post_init__(self):
        """Run the parent post-init; wrapped by requires_dependencies for boxfs."""
        super().__post_init__()

    @requires_dependencies(["boxfs"], extras="box")
    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Forward the synchronous upload to the parent implementation."""
        outcome = super().run(contents=contents, **kwargs)
        return outcome

    @requires_dependencies(["boxfs"], extras="box")
    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Forward the asynchronous upload to the parent implementation."""
        outcome = await super().run_async(path=path, file_data=file_data, **kwargs)
        return outcome
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# Registry entry bundling the Box connection, indexer and downloader classes.
box_source_entry = SourceRegistryEntry(
    connection_config=BoxConnectionConfig,
    indexer_config=BoxIndexerConfig,
    indexer=BoxIndexer,
    downloader_config=BoxDownloaderConfig,
    downloader=BoxDownloader,
)
|
|
126
|
+
|
|
127
|
+
# Registry entry bundling the Box connection and uploader classes.
box_destination_entry = DestinationRegistryEntry(
    connection_config=BoxConnectionConfig,
    uploader_config=BoxUploaderConfig,
    uploader=BoxUploader,
)
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Generator, Optional
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
8
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
10
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
11
|
+
DestinationRegistryEntry,
|
|
12
|
+
SourceRegistryEntry,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
15
|
+
FsspecAccessConfig,
|
|
16
|
+
FsspecConnectionConfig,
|
|
17
|
+
FsspecDownloader,
|
|
18
|
+
FsspecDownloaderConfig,
|
|
19
|
+
FsspecIndexer,
|
|
20
|
+
FsspecIndexerConfig,
|
|
21
|
+
FsspecUploader,
|
|
22
|
+
FsspecUploaderConfig,
|
|
23
|
+
)
|
|
24
|
+
from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict
|
|
25
|
+
|
|
26
|
+
# Identifier reused as the connector_type default by the Dropbox connector classes in this module.
CONNECTOR_TYPE = "dropbox"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class DropboxIndexerConfig(FsspecIndexerConfig):
    """Indexer configuration for Dropbox; adds nothing on top of FsspecIndexerConfig."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class DropboxAccessConfig(FsspecAccessConfig):
    """Access settings for the Dropbox connector."""

    # Optional token; presumably the Dropbox API access token (verify against the fs backend).
    token: Optional[str] = None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class DropboxConnectionConfig(FsspecConnectionConfig):
    """Connection configuration for the Dropbox connector."""

    supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"])
    access_config: DropboxAccessConfig = enhanced_field(
        sensitive=True,
        default_factory=DropboxAccessConfig,
    )
    connector_type: str = CONNECTOR_TYPE
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class DropboxIndexer(FsspecIndexer):
    """Indexer for Dropbox built on FsspecIndexer."""

    connection_config: DropboxConnectionConfig
    index_config: DropboxIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def __post_init__(self):
        """Make sure the configured path begins with a slash."""
        # NOTE(review): unlike DropboxUploader, this does not call super().__post_init__() —
        # confirm that skipping the parent hook is intentional.
        config = self.index_config
        # dropbox expects the path to start with a /
        if config.path_without_protocol.startswith("/"):
            return
        config.path_without_protocol = "/" + config.path_without_protocol

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Forward to the parent run() and hand back whatever it returns."""
        file_data_stream = super().run(**kwargs)
        return file_data_stream

    def sterilize_info(self, path) -> dict:
        """Fetch the filesystem info for ``path`` and pass it through sterilize_dict."""
        # fs.info in the dropboxdrivefs library takes the location through a "url"
        # kwarg instead of "path", even though both mean the same thing
        raw_info = self.fs.info(url=path)
        return sterilize_dict(data=raw_info)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
class DropboxDownloaderConfig(FsspecDownloaderConfig):
    """Downloader configuration for Dropbox; adds nothing on top of FsspecDownloaderConfig."""
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class DropboxDownloader(FsspecDownloader):
    """Downloader for Dropbox that defers all work to FsspecDownloader."""

    protocol: str = "dropbox"
    connection_config: DropboxConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[DropboxDownloaderConfig] = field(
        default_factory=DropboxDownloaderConfig,
    )

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Forward the synchronous download to the parent implementation."""
        response = super().run(file_data=file_data, **kwargs)
        return response

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Forward the asynchronous download to the parent implementation."""
        response = await super().run_async(file_data=file_data, **kwargs)
        return response
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
class DropboxUploaderConfig(FsspecUploaderConfig):
    """Uploader configuration for Dropbox; adds nothing on top of FsspecUploaderConfig."""
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@dataclass
class DropboxUploader(FsspecUploader):
    """Uploader for Dropbox that defers all work to FsspecUploader."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: DropboxConnectionConfig
    # Defaults to None even though the annotation is not Optional.
    upload_config: DropboxUploaderConfig = None

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def __post_init__(self):
        """Run the parent post-init; wrapped by requires_dependencies for the Dropbox extras."""
        super().__post_init__()

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Forward the synchronous upload to the parent implementation."""
        outcome = super().run(contents=contents, **kwargs)
        return outcome

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Forward the asynchronous upload to the parent implementation."""
        outcome = await super().run_async(path=path, file_data=file_data, **kwargs)
        return outcome
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# Registry entry bundling the Dropbox connection, indexer and downloader classes.
dropbox_source_entry = SourceRegistryEntry(
    connection_config=DropboxConnectionConfig,
    indexer_config=DropboxIndexerConfig,
    indexer=DropboxIndexer,
    downloader_config=DropboxDownloaderConfig,
    downloader=DropboxDownloader,
)
|
|
125
|
+
|
|
126
|
+
# Registry entry bundling the Dropbox connection and uploader classes.
dropbox_destination_entry = DestinationRegistryEntry(
    connection_config=DropboxConnectionConfig,
    uploader_config=DropboxUploaderConfig,
    uploader=DropboxUploader,
)
|
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import fnmatch
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from time import time
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
|
|
10
|
+
|
|
11
|
+
from unstructured.documents.elements import DataSourceMetadata
|
|
12
|
+
|
|
13
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
14
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
15
|
+
from unstructured_ingest.v2.interfaces import (
|
|
16
|
+
AccessConfig,
|
|
17
|
+
ConnectionConfig,
|
|
18
|
+
Downloader,
|
|
19
|
+
DownloaderConfig,
|
|
20
|
+
DownloadResponse,
|
|
21
|
+
FileData,
|
|
22
|
+
Indexer,
|
|
23
|
+
IndexerConfig,
|
|
24
|
+
SourceIdentifiers,
|
|
25
|
+
UploadContent,
|
|
26
|
+
Uploader,
|
|
27
|
+
UploaderConfig,
|
|
28
|
+
)
|
|
29
|
+
from unstructured_ingest.v2.logger import logger
|
|
30
|
+
from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from fsspec import AbstractFileSystem
|
|
34
|
+
|
|
35
|
+
# Default ``connector_type`` value shared by the fsspec connection config,
# indexer, downloader and uploader classes defined in this module.
CONNECTOR_TYPE = "fsspec"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Base:
    """Root mixin providing a no-op ``__post_init__``.

    Lets subclasses call ``super().__post_init__()`` unconditionally.
    """

    def __post_init__(self):
        """Do nothing; terminates the ``__post_init__`` chain."""
        return None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class FileConfig(Base):
    """Remote location described by a ``<protocol>://<path>`` URL.

    ``protocol`` and ``path_without_protocol`` are not constructor arguments;
    they are derived from ``remote_url`` in ``__post_init__``.

    Raises:
        ValueError: if ``remote_url`` has no ``://`` separator, or if its
            protocol is not listed in ``supported_protocols``.
    """

    remote_url: str
    protocol: str = field(init=False)
    path_without_protocol: str = field(init=False)
    supported_protocols: list[str] = field(
        default_factory=lambda: [
            "s3",
            "s3a",
            "abfs",
            "az",
            "gs",
            "gcs",
            "box",
            "dropbox",
            "sftp",
        ]
    )

    def __post_init__(self):
        super().__post_init__()
        # Split on the FIRST "://" only, so a path that itself contains "://"
        # is kept intact instead of failing tuple unpacking.
        protocol, separator, remainder = self.remote_url.partition("://")
        if not separator:
            raise ValueError(
                f"remote_url {self.remote_url!r} must have the form '<protocol>://<path>'"
            )
        self.protocol = protocol
        self.path_without_protocol = remainder
        if self.protocol not in self.supported_protocols:
            raise ValueError(
                "Protocol {} not supported yet, only {} are supported.".format(
                    self.protocol, ", ".join(self.supported_protocols)
                ),
            )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass
class FsspecIndexerConfig(FileConfig, IndexerConfig):
    """Indexing options layered on top of the remote location in ``FileConfig``."""

    # When True the indexer walks the tree with ``fs.find``; otherwise it lists
    # only the top level with ``fs.ls``.
    recursive: bool = False
    # Optional glob patterns; ``None`` means every listed file is accepted.
    file_glob: Optional[list[str]] = None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class FsspecAccessConfig(AccessConfig):
    """Base access config for fsspec connectors; declares no fields of its own."""
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# Type variable bounded by FsspecAccessConfig; annotates
# ``FsspecConnectionConfig.access_config``.
FsspecAccessConfigT = TypeVar("FsspecAccessConfigT", bound=FsspecAccessConfig)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
class FsspecConnectionConfig(ConnectionConfig):
    """Connection settings shared by the fsspec indexer, downloader and uploader."""

    # Flagged as sensitive through ``enhanced_field``; defaults to None.
    access_config: FsspecAccessConfigT = enhanced_field(default=None, sensitive=True)
    connector_type: str = CONNECTOR_TYPE
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Type variable bounded by FsspecIndexerConfig; annotates ``FsspecIndexer.index_config``.
FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
|
|
94
|
+
# Type variable bounded by FsspecConnectionConfig; annotates the
# ``connection_config`` field of the fsspec indexer and downloader.
FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnectionConfig)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass
class FsspecIndexer(Indexer):
    """List files on an fsspec filesystem and yield a ``FileData`` for each one.

    The filesystem class is looked up from ``index_config.protocol`` and
    instantiated with the keyword arguments returned by
    ``connection_config.get_access_config()``.
    """

    connection_config: FsspecConnectionConfigT
    index_config: FsspecIndexerConfigT
    connector_type: str = CONNECTOR_TYPE

    @property
    def fs(self) -> "AbstractFileSystem":
        """Build a filesystem object for the configured protocol."""
        # Imported lazily so fsspec is only needed when the filesystem is used.
        from fsspec import get_filesystem_class

        return get_filesystem_class(self.index_config.protocol)(
            **self.connection_config.get_access_config(),
        )

    def does_path_match_glob(self, path: str) -> bool:
        """Return True if ``path`` matches any configured glob (or none are set)."""
        patterns = self.index_config.file_glob
        if patterns is None:
            return True
        if any(fnmatch.fnmatch(path, pattern) for pattern in patterns):
            return True
        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
        return False

    def check_connection(self):
        """Validate the connection by listing the configured remote path.

        Raises:
            SourceConnectionError: if building the filesystem or listing fails.
        """
        try:
            self.fs.ls(path=self.index_config.path_without_protocol, detail=False)
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    @staticmethod
    def _is_nonempty_file(info: dict) -> bool:
        """Return True for entries typed ``"file"`` with a positive size."""
        # A missing or None size is treated as 0 rather than raising TypeError
        # on the comparison.
        return info.get("type") == "file" and (info.get("size") or 0) > 0

    def list_files(self) -> list[str]:
        """Return the names of non-empty files under the configured path.

        Raises:
            TypeError: if the filesystem returns an unexpected container type.
        """
        root = self.index_config.path_without_protocol
        if not self.index_config.recursive:
            # fs.ls does not walk directories
            # directories that are listed in cloud storage can cause problems
            # because they are seen as 0 byte files
            found = self.fs.ls(root, detail=True)
            if not isinstance(found, list):
                raise TypeError(f"unhandled response type from ls: {type(found)}")
            return [x.get("name") for x in found if self._is_nonempty_file(x)]

        # fs.find will recursively walk directories
        # "size" is a common key for all the cloud protocols with fs
        found = self.fs.find(root, detail=True)
        if not isinstance(found, dict):
            raise TypeError(f"unhandled response type from find: {type(found)}")
        return [k for k, v in found.items() if self._is_nonempty_file(v)]

    @staticmethod
    def _stringify_timestamp(value: Optional[Any]) -> Optional[str]:
        """Convert a created/modified value to a string, or None if falsy."""
        if not value:
            return None
        if isinstance(value, datetime):
            return str(value.timestamp())
        return str(value)

    def get_metadata(self, path: str) -> DataSourceMetadata:
        """Collect timestamps, checksum and optional metadata for ``path``."""
        # Filesystems that do not implement created()/modified() raise
        # NotImplementedError; the date is then left as None.
        try:
            date_created = self._stringify_timestamp(self.fs.created(path))
        except NotImplementedError:
            date_created = None

        try:
            date_modified = self._stringify_timestamp(self.fs.modified(path))
        except NotImplementedError:
            date_modified = None

        version = self.fs.checksum(path)
        metadata: dict[str, str] = {}
        # An AttributeError from fs.metadata is tolerated; metadata stays empty.
        with contextlib.suppress(AttributeError):
            metadata = self.fs.metadata(path)
        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
        }
        if metadata:
            record_locator["metadata"] = metadata
        return DataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=str(version),
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
        )

    def sterilize_info(self, path) -> dict:
        """Return ``fs.info(path)`` passed through ``sterilize_dict``."""
        info = self.fs.info(path=path)
        return sterilize_dict(data=info)

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every listed file that passes the glob filter."""
        raw_files = self.list_files()
        files = [f for f in raw_files if self.does_path_match_glob(f)]
        root = self.index_config.path_without_protocol
        for file in files:
            # Strip only the FIRST occurrence of the root so a directory deeper
            # in the tree that repeats the root's name is left untouched.
            # Note: we remove any remaining leading slashes (Box introduces these)
            # to get a valid relative path
            rel_path = file.replace(root, "", 1).lstrip("/")
            yield FileData(
                identifier=file,
                connector_type=self.connector_type,
                source_identifiers=SourceIdentifiers(
                    filename=Path(file).name,
                    rel_path=rel_path or None,
                    fullpath=file,
                ),
                metadata=self.get_metadata(path=file),
                additional_metadata=self.sterilize_info(path=file),
            )
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
@dataclass
class FsspecDownloaderConfig(DownloaderConfig):
    """Download settings for fsspec connectors; declares no fields of its own."""
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# Type variable bounded by FsspecDownloaderConfig; annotates
# ``FsspecDownloader.download_config``.
FsspecDownloaderConfigT = TypeVar("FsspecDownloaderConfigT", bound=FsspecDownloaderConfig)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
@dataclass
class FsspecDownloader(Downloader):
    """Download files from an fsspec filesystem to a local path.

    The filesystem class is looked up from ``protocol`` and instantiated with
    the keyword arguments returned by ``connection_config.get_access_config()``.
    """

    protocol: str
    connection_config: FsspecConnectionConfigT
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[FsspecDownloaderConfigT] = field(
        default_factory=lambda: FsspecDownloaderConfig()
    )

    def is_async(self) -> bool:
        """Return the filesystem's ``async_impl`` flag."""
        return self.fs.async_impl

    @property
    def fs(self) -> "AbstractFileSystem":
        """Build a filesystem object for the configured protocol."""
        # Imported lazily so fsspec is only needed when the filesystem is used.
        from fsspec import get_filesystem_class

        return get_filesystem_class(self.protocol)(
            **self.connection_config.get_access_config(),
        )

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the local path the file should be downloaded to."""
        # Written as explicit branches: the original conditional expression
        # applied the ``download_dir`` join only to the truthy branch.
        if self.download_config:
            return self.download_dir / Path(file_data.source_identifiers.relative_path)
        return Path(file_data.source_identifiers.rel_path)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download ``file_data.identifier`` and return the download response.

        Raises:
            SourceConnectionNetworkError: if the filesystem ``get`` call fails.
        """
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
        except Exception as e:
            logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
            # Chain the cause so the underlying failure stays in the traceback.
            raise SourceConnectionNetworkError(
                f"failed to download file {file_data.identifier}"
            ) from e
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Asynchronous counterpart of ``run``.

        Raises:
            SourceConnectionNetworkError: if the filesystem ``get`` call fails.
        """
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            # NOTE(review): this awaits the result of ``fs.get``; confirm that the
            # filesystem in use actually returns an awaitable here.
            await self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
        except Exception as e:
            logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
            raise SourceConnectionNetworkError(
                f"failed to download file {file_data.identifier}"
            ) from e
        return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
@dataclass
class FsspecUploaderConfig(FileConfig, UploaderConfig):
    """Upload options layered on top of the remote location in ``FileConfig``."""

    # When False, a file that already exists at the destination is skipped.
    overwrite: bool = False
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
# Type variable bounded by FsspecUploaderConfig; annotates
# ``FsspecUploader.upload_config``.
FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
@dataclass
class FsspecUploader(Uploader):
    """Upload local files to an fsspec filesystem.

    The destination is ``upload_config.path_without_protocol`` joined with the
    file's relative path, with ``.json`` appended to the file name.
    """

    connector_type: str = CONNECTOR_TYPE
    upload_config: FsspecUploaderConfigT = field(default=None)

    @property
    def fs(self) -> "AbstractFileSystem":
        """Build a filesystem object for the upload protocol."""
        # Imported lazily so fsspec is only needed when the filesystem is used.
        from fsspec import get_filesystem_class

        fs_kwargs = self.connection_config.get_access_config() if self.connection_config else {}
        return get_filesystem_class(self.upload_config.protocol)(
            **fs_kwargs,
        )

    def __post_init__(self):
        """Reject construction without an ``upload_config``.

        Raises:
            TypeError: if ``upload_config`` is missing or falsy.
        """
        # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
        if not self.upload_config:
            raise TypeError(
                f"{self.__class__.__name__}.__init__() "
                "missing 1 required positional argument: 'upload_config'"
            )

    def get_upload_path(self, file_data: FileData) -> Path:
        """Return the destination path for ``file_data`` with ``.json`` appended."""
        upload_path = (
            Path(self.upload_config.path_without_protocol)
            / file_data.source_identifiers.relative_path
        )
        updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
        return updated_upload_path

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Upload every item in ``contents`` sequentially."""
        for content in contents:
            self._run(path=content.path, file_data=content.file_data)

    def _run(self, path: Path, file_data: FileData) -> None:
        """Upload one local file unless it already exists and overwrite is off."""
        path_str = str(path.resolve())
        upload_path = self.get_upload_path(file_data=file_data)
        # Build the filesystem once instead of once per call below.
        fs = self.fs
        # Check the cheap local flag first so the remote exists() round-trip is
        # skipped entirely when overwriting is enabled.
        if not self.upload_config.overwrite and fs.exists(path=str(upload_path)):
            logger.debug(f"Skipping upload of {path} to {upload_path}, file already exists")
            return
        logger.debug(f"Writing local file {path_str} to {upload_path}")
        fs.upload(lpath=path_str, rpath=str(upload_path))

    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Async entry point; performs the same blocking upload as ``_run``."""
        # Odd that fsspec doesn't run exists() as async even when client support async
        # The calls made here were already synchronous, so share the single
        # implementation instead of duplicating it.
        self._run(path=path, file_data=file_data)
|