unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import hashlib
|
|
3
|
+
import typing as t
|
|
4
|
+
import uuid
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from dataclasses_json.core import Json
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
11
|
+
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
12
|
+
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
13
|
+
from unstructured_ingest.interfaces import (
|
|
14
|
+
AccessConfig,
|
|
15
|
+
BaseConnectorConfig,
|
|
16
|
+
BaseDestinationConnector,
|
|
17
|
+
BaseIngestDocBatch,
|
|
18
|
+
BaseSingleIngestDoc,
|
|
19
|
+
BaseSourceConnector,
|
|
20
|
+
IngestDocCleanupMixin,
|
|
21
|
+
SourceConnectorCleanupMixin,
|
|
22
|
+
SourceMetadata,
|
|
23
|
+
WriteConfig,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.logger import logger
|
|
26
|
+
from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
|
|
27
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
28
|
+
|
|
29
|
+
if t.TYPE_CHECKING:
|
|
30
|
+
from elasticsearch import Elasticsearch
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class ElasticsearchAccessConfig(AccessConfig):
    """Credentials and TLS settings used to build an Elasticsearch client.

    Every field is optional. ``to_dict`` reshapes whichever ones are set into
    the keyword arguments that the connectors in this module pass to
    ``Elasticsearch(...)``.
    """

    hosts: t.Optional[t.List[str]] = None
    username: t.Optional[str] = None
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    cloud_id: t.Optional[str] = None
    api_key: t.Optional[str] = enhanced_field(
        default=None, sensitive=True, overload_name="es_api_key"
    )
    api_key_id: t.Optional[str] = None
    bearer_auth: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    ca_certs: t.Optional[str] = None
    ssl_assert_fingerprint: t.Optional[str] = enhanced_field(default=None, sensitive=True)

    def to_dict(self, **kwargs) -> t.Dict[str, Json]:
        """Serialize the config, reshaping auth fields into client keyword arguments.

        Auth related fields are updated to conform to what the SDK expects based
        on the supported methods:
        https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html

        Args:
            **kwargs: Forwarded unchanged to the parent ``to_dict``.

        Returns:
            The serialized dict where, depending on which fields are set,
            ``username``/``password`` are replaced by a ``basic_auth`` tuple or
            ``api_key_id``/``api_key`` are merged into a single ``api_key``
            tuple. ``api_key_id`` is never present in the result.
        """
        d = super().to_dict(**kwargs)

        if not self.ca_certs:
            # ES library already sets a default for this, don't want to
            # introduce data by setting it to None. A default is supplied to
            # pop() so a serializer that already omitted the key cannot raise.
            d.pop("ca_certs", None)

        secured_endpoint = self.cloud_id or self.ca_certs or self.ssl_assert_fingerprint
        if self.password and secured_endpoint:
            # NOTE(review): this branch always authenticates as "elastic" and
            # leaves any "username" entry in the dict untouched -- confirm that
            # ignoring a user-supplied username here is intended.
            d.pop("password", None)
            d["basic_auth"] = ("elastic", self.password)
        elif not self.cloud_id and self.username and self.password:
            d.pop("username", None)
            d.pop("password", None)
            d["basic_auth"] = (self.username, self.password)
        elif self.api_key and self.api_key_id:
            # Replace the bare key with the (id, key) pair.
            d["api_key"] = (self.api_key_id, self.api_key)

        # This doesn't exist on the client init, remove:
        d.pop("api_key_id", None)
        return d
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass
class SimpleElasticsearchConfig(BaseConnectorConfig):
    """Connector settings shared by the Elasticsearch source and destination.

    Attributes (as used by the connectors in this module):
        index_name: name of the index to read from / reach to.
        batch_size: number of document ids grouped into one ingest-doc batch
            by the source connector.
        fields: document fields requested as ``_source`` when fetching; when
            non-empty, a hash of the list is also appended to generated
            file names.
        access_config: credentials/TLS settings converted into the keyword
            arguments for the Elasticsearch client.
    """

    # Target index.
    index_name: str
    # Ids per batch when the source connector splits the index.
    batch_size: int = 100
    # Subset of source fields to fetch; an empty list applies no field filter.
    fields: t.List[str] = field(default_factory=list)
    # Defaults to None, although the connectors dereference it unconditionally.
    access_config: ElasticsearchAccessConfig = None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass
class ElasticsearchDocumentMeta:
    """Locator for a single Elasticsearch document.

    Pairs the name of the index being read with the id of the document
    inside that index.
    """

    # Name of the elasticsearch index that holds the document.
    index_name: str
    # Id of the document within that index.
    document_id: str
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass
class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating a fetched doc and writing processed results (but not
    doing the processing!).

    ``get_file`` is intentionally a no-op here: documents are fetched and
    written to disk in bulk by ``ElasticsearchIngestDocBatch.get_files``, which
    also constructs instances of this class.
    """

    connector_config: SimpleElasticsearchConfig
    document_meta: ElasticsearchDocumentMeta
    document: dict = field(default_factory=dict)
    registry_name: str = "elasticsearch"

    def _es_filename_stem(self) -> str:
        """Return the document id, suffixed with a short hash of the requested fields.

        When ``connector_config.fields`` is non-empty, the first 8 hex
        characters of the SHA256 of the comma-joined field names are appended,
        so the same document fetched with different field selections maps to
        different files.
        """
        stem = self.document_meta.document_id
        requested_fields = self.connector_config.fields
        if requested_fields:
            digest = hashlib.sha256(",".join(requested_fields).encode()).hexdigest()[:8]
            stem = f"{stem}-{digest}"
        return stem

    # TODO: remove one of filename or _tmp_download_file, using a wrapper
    @property
    def filename(self):
        """Absolute path of the downloaded text file for this document."""
        download_root = Path(self.read_config.download_dir)
        return (
            download_root / self.document_meta.index_name / f"{self._es_filename_stem()}.txt"
        ).resolve()

    @property
    def _output_filename(self):
        """Create filename document id combined with a hash of the query to uniquely identify
        the output file."""
        output_root = Path(self.processor_config.output_dir)
        return output_root / self.connector_config.index_name / f"{self._es_filename_stem()}.json"

    def update_source_metadata(self, **kwargs):
        """Populate ``source_metadata`` from the fetched document.

        A missing document -- ``None`` or the empty dict this field defaults
        to -- is recorded as ``exists=False``; otherwise the document's
        ``_version`` is recorded and ``exists`` is set to True.
        """
        # ``document`` defaults to {} rather than None, so test for emptiness;
        # an ``is None`` check would fall through and fail on "_version".
        if not self.document:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        self.source_metadata = SourceMetadata(
            version=self.document["_version"],
            exists=True,
        )

    @SourceConnectionError.wrap
    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """No-op: the file is written by the batch that created this doc."""
        pass

    @property
    def date_created(self) -> t.Optional[str]:
        """Not provided by this connector; always None."""
        return None

    @property
    def date_modified(self) -> t.Optional[str]:
        """Not provided by this connector; always None."""
        return None

    @property
    def source_url(self) -> t.Optional[str]:
        """Not provided by this connector; always None."""
        return None

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Hosts, index name and document id that identify this record."""
        return {
            "hosts": self.connector_config.access_config.hosts,
            "index_name": self.connector_config.index_name,
            "document_id": self.document_meta.document_id,
        }
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@dataclass
class ElasticsearchIngestDocBatch(BaseIngestDocBatch):
    """A batch of Elasticsearch documents, identified by id, fetched in one scan.

    ``get_files`` fetches every document listed in ``list_of_ids``, writes each
    one to disk as text and appends a matching ``ElasticsearchIngestDoc`` to
    ``ingest_docs``.
    """

    connector_config: SimpleElasticsearchConfig
    ingest_docs: t.List[ElasticsearchIngestDoc] = field(default_factory=list)
    list_of_ids: t.List[str] = field(default_factory=list)
    registry_name: str = "elasticsearch_batch"

    def __post_init__(self):
        """Validate that at least one document id was supplied.

        Raises:
            ValueError: if ``list_of_ids`` is empty.
        """
        # Until python3.8 is deprecated, this is a limitation of dataclass inheritance
        # to make it a required field
        if not self.list_of_ids:
            raise ValueError("list_of_ids is required")

    @property
    def unique_id(self) -> str:
        """Comma-joined, sorted document ids of this batch."""
        return ",".join(sorted(self.list_of_ids))

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def _get_docs(self):
        """Fetch the batch's documents (with versions) from the configured index.

        Returns:
            A list of the raw hits yielded by ``elasticsearch.helpers.scan``.
        """
        from elasticsearch import Elasticsearch
        from elasticsearch.helpers import scan

        es = Elasticsearch(**self.connector_config.access_config.to_dict(apply_name_overload=False))
        scan_query = {
            "_source": self.connector_config.fields,
            "version": True,
            "query": {"ids": {"values": self.list_of_ids}},
        }
        try:
            # Materialize the generator before the client is closed.
            return list(
                scan(
                    es,
                    query=scan_query,
                    scroll="1m",
                    index=self.connector_config.index_name,
                )
            )
        finally:
            # This client is created per call and not stored anywhere, so
            # release its connections instead of leaking them.
            es.close()

    @SourceConnectionError.wrap
    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def get_files(self):
        """Fetch every document of the batch and write each one to its download path.

        Each document's ``_source`` is flattened and its values are written,
        one per line, to the ingest doc's ``filename``.
        """
        for doc in self._get_docs():
            ingest_doc = ElasticsearchIngestDoc(
                processor_config=self.processor_config,
                read_config=self.read_config,
                connector_config=self.connector_config,
                document=doc,
                document_meta=ElasticsearchDocumentMeta(
                    self.connector_config.index_name, doc["_id"]
                ),
            )
            ingest_doc.update_source_metadata()

            flattened = flatten_dict(dictionary=doc["_source"])
            text = "\n".join(str(value) for value in flattened.values())

            target = ingest_doc.filename
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(text, encoding="utf8")
            self.ingest_docs.append(ingest_doc)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
@dataclass
class ElasticsearchSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Fetches particular fields from all documents in a given elasticsearch cluster and index"""

    connector_config: SimpleElasticsearchConfig
    _es: t.Optional["Elasticsearch"] = field(init=False, default=None)

    @property
    def es(self):
        """Elasticsearch client, created on first access and then reused."""
        if self._es is None:
            # Imported lazily so the module loads without the optional dependency.
            from elasticsearch import Elasticsearch

            self._es = Elasticsearch(
                **self.connector_config.access_config.to_dict(apply_name_overload=False)
            )
        return self._es

    def check_connection(self):
        """Issue a ``HEAD /`` request to verify the cluster is reachable.

        Raises:
            SourceConnectionError: if the request fails for any reason; the
                original exception is attached as the cause.
        """
        try:
            self.es.perform_request("HEAD", "/", headers={"accept": "application/json"})
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def __post_init__(self):
        # Match every document but return no stored fields: only ids are needed.
        self.scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}

    def initialize(self):
        """Nothing to set up; the client is created on first use."""
        pass

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def _get_doc_ids(self):
        """Fetches all document ids in an index"""
        from elasticsearch.helpers import scan

        hits = scan(
            self.es,
            query=self.scan_query,
            scroll="1m",
            index=self.connector_config.index_name,
        )

        return [hit["_id"] for hit in hits]

    def get_ingest_docs(self):
        """Fetches all documents in an index, using ids that are fetched with _get_doc_ids

        Returns:
            One ``ElasticsearchIngestDocBatch`` per ``batch_size`` consecutive
            ids; the final batch may be shorter.
        """
        ids = self._get_doc_ids()
        batch_size = self.connector_config.batch_size
        return [
            ElasticsearchIngestDocBatch(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                list_of_ids=ids[start : start + batch_size],  # noqa
            )
            for start in range(0, len(ids), batch_size)
        ]
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
@dataclass
class ElasticsearchWriteConfig(WriteConfig):
    """Write settings consumed by the Elasticsearch destination connector."""

    # Byte limit handed to generator_batching_wbytes when splitting uploads.
    batch_size_bytes: int = 15_000_000
    # Forwarded to parallel_bulk as its thread_count.
    num_processes: int = 1
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
@dataclass
class ElasticsearchDestinationConnector(BaseDestinationConnector):
    """Destination connector that bulk-uploads element dicts to an Elasticsearch index."""

    write_config: ElasticsearchWriteConfig
    connector_config: SimpleElasticsearchConfig
    # Cached client; populated on first access of the `client` property.
    _client: t.Optional["Elasticsearch"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """
        The _client variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        # Work on a shallow copy so the live connector keeps its client.
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_client"):
            self_cp._client = None
        return _asdict(self_cp, **kwargs)

    @DestinationConnectionError.wrap
    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def generate_client(self) -> "Elasticsearch":
        """Build a new Elasticsearch client from the connector's access config."""
        from elasticsearch import Elasticsearch

        return Elasticsearch(
            **self.connector_config.access_config.to_dict(apply_name_overload=False)
        )

    @property
    def client(self):
        """Return the cached client, creating it with generate_client on first use."""
        if self._client is None:
            self._client = self.generate_client()
        return self._client

    def initialize(self):
        """Force creation of the client by touching the `client` property."""
        _ = self.client

    @DestinationConnectionError.wrap
    def check_connection(self):
        """Ping the cluster.

        Raises:
            DestinationConnectionError: if the ping returns a falsy value or
                any exception is raised while pinging.
        """
        try:
            # Explicit check rather than `assert`, which is removed under `python -O`
            # and would make this validation a silent no-op.
            if not self.client.ping():
                raise ConnectionError("ping to elasticsearch was unsuccessful")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upload ``elements_dict`` to the configured index in byte-limited batches.

        Each batch produced by generator_batching_wbytes is sent with
        parallel_bulk; results reported as unsuccessful are logged as errors.

        Args:
            elements_dict: the actions to upload (keyword-only).
        """
        logger.info(
            "writing document batches to destination"
            f" index named {self.connector_config.index_name}"
            f" at {self.connector_config.access_config.hosts}"
            f" with batch size (in bytes) {self.write_config.batch_size_bytes}"
            f" with {self.write_config.num_processes} (number of) processes"
        )
        from elasticsearch.helpers import parallel_bulk

        for batch in generator_batching_wbytes(
            elements_dict, batch_size_limit_bytes=self.write_config.batch_size_bytes
        ):
            for success, info in parallel_bulk(
                self.client, batch, thread_count=self.write_config.num_processes
            ):
                if not success:
                    # The %s placeholder is required: passing `info` as a bare extra
                    # argument makes logging fail to format and drops the details.
                    logger.error(
                        "upload failed for a batch in elasticsearch destination connector: %s",
                        info,
                    )

    def normalize_dict(self, element_dict: dict) -> dict:
        """Convert one element dict into an action dict for the configured index.

        Note that the known keys are popped from ``element_dict``, so the
        argument is mutated. Each action gets a fresh random ``_id``.
        """
        return {
            "_index": self.connector_config.index_name,
            "_id": str(uuid.uuid4()),
            "_source": {
                "element_id": element_dict.pop("element_id", None),
                "embeddings": element_dict.pop("embeddings", None),
                "text": element_dict.pop("text", None),
                "type": element_dict.pop("type", None),
                "metadata": flatten_dict(
                    element_dict.pop("metadata", None),
                    separator="-",
                ),
            },
        }
|
|
File without changes
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.connector.fsspec.fsspec import (
|
|
5
|
+
FsspecDestinationConnector,
|
|
6
|
+
FsspecIngestDoc,
|
|
7
|
+
FsspecSourceConnector,
|
|
8
|
+
FsspecWriteConfig,
|
|
9
|
+
SimpleFsspecConfig,
|
|
10
|
+
WriteTextConfig,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
13
|
+
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
14
|
+
from unstructured_ingest.interfaces import AccessConfig
|
|
15
|
+
from unstructured_ingest.logger import logger
|
|
16
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class AzureWriteTextConfig(WriteTextConfig):
    """Text-write options for Azure: the base options plus an ``overwrite`` flag."""

    # Disabled unless explicitly requested.
    overwrite: bool = False
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class AzureWriteConfig(FsspecWriteConfig):
    """Fsspec write config whose text options use the Azure-specific type."""

    # Optional; left as None when no Azure text options are supplied.
    write_text_config: t.Optional[AzureWriteTextConfig] = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class AzureAccessConfig(AccessConfig):
    """Azure credentials; every field is optional and flagged as sensitive."""

    account_name: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    account_key: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    connection_string: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    sas_token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class SimpleAzureBlobStorageConfig(SimpleFsspecConfig):
    """Fsspec connector config that carries Azure credentials."""

    # Defaults to None even though the annotation is not Optional.
    access_config: AzureAccessConfig = None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class AzureBlobStorageIngestDoc(FsspecIngestDoc):
    """Fsspec ingest document registered under the name "azure"."""

    connector_config: SimpleAzureBlobStorageConfig
    registry_name: str = "azure"

    @SourceConnectionError.wrap
    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    def get_file(self):
        """Delegate to the parent implementation; the decorators add the
        adlfs/fsspec requirement and SourceConnectionError wrapping."""
        super().get_file()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class AzureBlobStorageSourceConnector(FsspecSourceConnector):
    """Fsspec source connector that produces AzureBlobStorageIngestDoc instances."""

    connector_config: SimpleAzureBlobStorageConfig

    def __post_init__(self):
        # Select the ingest-doc class used for this connector.
        self.ingest_doc_cls: t.Type[AzureBlobStorageIngestDoc] = AzureBlobStorageIngestDoc
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class AzureBlobStorageDestinationConnector(FsspecDestinationConnector):
    """Fsspec destination connector configured with Azure config types."""

    connector_config: SimpleAzureBlobStorageConfig
    write_config: AzureWriteConfig

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    def initialize(self):
        """Delegate to the parent initialization; the decorator adds the
        adlfs/fsspec requirement."""
        super().initialize()

    @requires_dependencies(["adlfs"], extras="azure")
    def check_connection(self):
        """Validate the access config by constructing an AzureBlobFileSystem.

        Raises:
            DestinationConnectionError: if construction raises ValueError. The
                original error is attached as the cause. Other exception types
                propagate unchanged.
        """
        from adlfs import AzureBlobFileSystem

        try:
            AzureBlobFileSystem(**self.connector_config.get_access_config())
        except ValueError as connection_error:
            logger.error(f"failed to validate connection: {connection_error}", exc_info=True)
            raise DestinationConnectionError(
                f"failed to validate connection: {connection_error}"
            ) from connection_error
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Box Connector
|
|
3
|
+
Box does not make it simple to download files with an App.
|
|
4
|
+
First of all, this does not work with a free Box account.
|
|
5
|
+
Make sure the App service email is a collaborator for your folder (co-owner or editor)
|
|
6
|
+
Make sure you have the 'write all files' application scope
|
|
7
|
+
You may also need to enable 'Make API calls using the as-user header'
|
|
8
|
+
REAUTHORIZE app after making any of the above changes
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import typing as t
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
|
|
14
|
+
from unstructured_ingest.connector.fsspec.fsspec import (
|
|
15
|
+
FsspecDestinationConnector,
|
|
16
|
+
FsspecIngestDoc,
|
|
17
|
+
FsspecSourceConnector,
|
|
18
|
+
FsspecWriteConfig,
|
|
19
|
+
SimpleFsspecConfig,
|
|
20
|
+
)
|
|
21
|
+
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
22
|
+
from unstructured_ingest.interfaces import AccessConfig
|
|
23
|
+
from unstructured_ingest.logger import logger
|
|
24
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class AccessTokenError(Exception):
    """Signals a problem with the Box access token."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class BoxWriteConfig(FsspecWriteConfig):
    """Write config for Box; adds nothing to the fsspec write config."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class BoxAccessConfig(AccessConfig):
    """Access settings for the Box connector."""

    # Passed to JWTAuth.from_settings_file when the access kwargs are built.
    box_app_config: t.Optional[str] = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class SimpleBoxConfig(SimpleFsspecConfig):
    """Fsspec connector config that carries Box access settings."""

    # Defaults to None even though the annotation is not Optional.
    access_config: BoxAccessConfig = None

    @requires_dependencies(["boxfs"], extras="box")
    def get_access_config(self) -> dict:
        """Return the access kwargs with a freshly built ``oauth`` entry.

        The oauth object is created on every call rather than stored in the
        config because it is not serializable. ``box_app_config`` itself is
        left out of the returned dict.
        """
        from boxsdk import JWTAuth

        oauth = JWTAuth.from_settings_file(
            self.access_config.box_app_config,
        )
        remaining: dict[str, t.Any] = self.access_config.to_dict()
        remaining.pop("box_app_config", None)

        # "oauth" comes first; any remaining entries are layered on top of it.
        return {"oauth": oauth, **remaining}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class BoxIngestDoc(FsspecIngestDoc):
    """Fsspec ingest document registered under the name "box"."""

    connector_config: SimpleBoxConfig
    registry_name: str = "box"

    @SourceConnectionError.wrap
    @requires_dependencies(["boxfs", "fsspec"], extras="box")
    def get_file(self):
        """Delegate to the parent implementation; the decorators add the
        boxfs/fsspec requirement and SourceConnectionError wrapping."""
        super().get_file()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class BoxSourceConnector(FsspecSourceConnector):
    """Fsspec source connector that produces BoxIngestDoc instances."""

    connector_config: SimpleBoxConfig

    @requires_dependencies(["boxfs"], extras="box")
    def check_connection(self):
        """Validate the access config by constructing a BoxFileSystem.

        Raises:
            SourceConnectionError: if construction fails for any reason. The
                original exception is attached as the cause.
        """
        from boxfs import BoxFileSystem

        try:
            BoxFileSystem(**self.connector_config.get_access_config())
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def __post_init__(self):
        # Select the ingest-doc class used for this connector.
        self.ingest_doc_cls: t.Type[BoxIngestDoc] = BoxIngestDoc
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass
class BoxDestinationConnector(FsspecDestinationConnector):
    """Fsspec destination connector configured with Box config types."""

    connector_config: SimpleBoxConfig
    write_config: BoxWriteConfig

    @requires_dependencies(["boxfs", "fsspec"], extras="box")
    def initialize(self):
        """Delegate to the parent initialization; the decorator adds the
        boxfs/fsspec requirement."""
        super().initialize()

    @requires_dependencies(["boxfs"], extras="box")
    def check_connection(self):
        """Validate the access config by constructing a BoxFileSystem.

        Raises:
            DestinationConnectionError: if construction fails for any reason.
                The original exception is attached as the cause.
        """
        from boxfs import BoxFileSystem

        try:
            BoxFileSystem(**self.connector_config.get_access_config())
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e
|