unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
import sys
|
|
4
|
+
import uuid
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from time import time
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
9
|
+
|
|
10
|
+
from unstructured.documents.elements import DataSourceMetadata
|
|
11
|
+
|
|
12
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
|
|
13
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
14
|
+
from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
|
|
15
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
16
|
+
from unstructured_ingest.v2.interfaces import (
|
|
17
|
+
AccessConfig,
|
|
18
|
+
ConnectionConfig,
|
|
19
|
+
Downloader,
|
|
20
|
+
DownloaderConfig,
|
|
21
|
+
DownloadResponse,
|
|
22
|
+
FileData,
|
|
23
|
+
Indexer,
|
|
24
|
+
IndexerConfig,
|
|
25
|
+
UploadContent,
|
|
26
|
+
Uploader,
|
|
27
|
+
UploaderConfig,
|
|
28
|
+
UploadStager,
|
|
29
|
+
UploadStagerConfig,
|
|
30
|
+
download_responses,
|
|
31
|
+
)
|
|
32
|
+
from unstructured_ingest.v2.logger import logger
|
|
33
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
34
|
+
DestinationRegistryEntry,
|
|
35
|
+
SourceRegistryEntry,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
if TYPE_CHECKING:
|
|
39
|
+
from elasticsearch import Elasticsearch as ElasticsearchClient
|
|
40
|
+
|
|
41
|
+
# Connector type label: used as the default `connector_type` of the indexer and
# downloader classes below and stamped onto every FileData built in this module.
CONNECTOR_TYPE = "elasticsearch"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class ElasticsearchAccessConfig(AccessConfig):
    """Secret credentials for authenticating against an Elasticsearch cluster.

    Every field is optional. Which of them end up being used is decided by
    ``ElasticsearchConnectionConfig.get_client_kwargs``.
    """

    # Password used for basic auth.
    password: Optional[str] = None
    # API key; declared with the overload name "es_api_key".
    api_key: Optional[str] = enhanced_field(default=None, overload_name="es_api_key")
    # NOTE(review): not read anywhere in the visible part of this module.
    bearer_auth: Optional[str] = None
    # Only consulted as a presence check when get_client_kwargs picks basic auth.
    ssl_assert_fingerprint: Optional[str] = None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class ElasticsearchClientInput(EnhancedDataClassJsonMixin):
    """Keyword arguments that get handed to the Elasticsearch client constructor.

    Instances are filled in by ``ElasticsearchConnectionConfig.get_client_kwargs``
    and serialized with ``to_dict``; the two credential fields are marked
    sensitive so they can be redacted when the mapping is logged.
    """

    hosts: Optional[list[str]] = None
    cloud_id: Optional[str] = None
    ca_certs: Optional[str] = None
    # (username, password) pair.
    basic_auth: Optional[tuple[str, str]] = enhanced_field(sensitive=True, default=None)
    # NOTE(review): get_client_kwargs may also store an (api_key_id, api_key)
    # tuple here, which the Optional[str] annotation does not cover.
    api_key: Optional[str] = enhanced_field(sensitive=True, default=None)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class ElasticsearchConnectionConfig(ConnectionConfig):
    """Connection settings for Elasticsearch and construction of a client from them."""

    hosts: Optional[list[str]] = None
    username: Optional[str] = None
    cloud_id: Optional[str] = None
    api_key_id: Optional[str] = None
    ca_certs: Optional[str] = None
    access_config: ElasticsearchAccessConfig = enhanced_field(sensitive=True)

    def get_client_kwargs(self) -> dict:
        """Translate this config into keyword arguments for the Elasticsearch client.

        Exactly one authentication method is selected, in this order of precedence:

        1. password together with any of cloud_id / ca_certs / ssl_assert_fingerprint:
           basic auth with the fixed user name "elastic" (``username`` is ignored)
        2. no cloud_id, but username and password: basic auth with that user name
        3. api_key and api_key_id: API key passed as an ``(id, key)`` pair
        4. api_key alone: API key passed as a plain string

        Returns:
            A dict holding only the entries that were actually set (``None``
            values are dropped), with sensitive values left unredacted.
        """
        # Update auth related fields to conform to what the SDK expects based on the
        # supported methods:
        # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
        access = self.access_config
        client_input = ElasticsearchClientInput()
        if self.hosts:
            client_input.hosts = self.hosts
        if self.cloud_id:
            client_input.cloud_id = self.cloud_id
        if self.ca_certs:
            client_input.ca_certs = self.ca_certs
        if access.password and (
            self.cloud_id or self.ca_certs or access.ssl_assert_fingerprint
        ):
            client_input.basic_auth = ("elastic", access.password)
        elif not self.cloud_id and self.username and access.password:
            client_input.basic_auth = (self.username, access.password)
        elif access.api_key and self.api_key_id:
            client_input.api_key = (self.api_key_id, access.api_key)
        elif access.api_key:
            client_input.api_key = access.api_key
        # Only the redacted form is logged; the real values go to the client.
        logger.debug(
            f"Elasticsearch client inputs mapped to: {client_input.to_dict(redact_sensitive=True)}"
        )
        client_kwargs = client_input.to_dict(redact_sensitive=False)
        return {k: v for k, v in client_kwargs.items() if v is not None}

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def get_client(self) -> "ElasticsearchClient":
        """Build a synchronous Elasticsearch client and verify it can reach the server.

        Raises:
            SourceConnectionError: if the connection check fails.
        """
        from elasticsearch import Elasticsearch as ElasticsearchClient

        client = ElasticsearchClient(**self.get_client_kwargs())
        self.check_connection(client=client)
        return client

    def check_connection(self, client: "ElasticsearchClient"):
        """Issue a ``HEAD /`` request with ``client`` to validate the connection.

        Raises:
            SourceConnectionError: wrapping whatever exception the request raised;
                the original exception is attached as ``__cause__``.
        """
        try:
            client.perform_request("HEAD", "/", headers={"accept": "application/json"})
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain explicitly so the underlying client error stays in the traceback.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@dataclass
class ElasticsearchIndexerConfig(IndexerConfig):
    """Settings for listing the documents of one Elasticsearch index."""

    # Index whose document ids are scanned.
    index_name: str
    # Number of document ids grouped into each FileData emitted by the indexer.
    batch_size: int = 100
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@dataclass
class ElasticsearchIndexer(Indexer):
    """Lists every document id in an index and emits them as batched FileData."""

    connection_config: ElasticsearchConnectionConfig
    index_config: ElasticsearchIndexerConfig
    client: "ElasticsearchClient" = field(init=False)
    connector_type: str = CONNECTOR_TYPE

    def __post_init__(self):
        # get_client() also runs the connection check, so construction fails
        # early if the cluster is unreachable.
        self.client = self.connection_config.get_client()

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def load_scan(self):
        """Import and return ``elasticsearch.helpers.scan`` lazily."""
        from elasticsearch.helpers import scan

        return scan

    def _get_doc_ids(self) -> set[str]:
        """Fetches all document ids in an index"""
        scan = self.load_scan()

        # stored_fields=[] asks for no field payload; only the hit's "_id" is read.
        scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
        hits = scan(
            self.client,
            query=scan_query,
            scroll="1m",
            index=self.index_config.index_name,
        )

        return {hit["_id"] for hit in hits}

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one FileData per batch of at most ``batch_size`` document ids.

        Each FileData carries the batch's ids and the index name in
        ``additional_metadata``. Its identifier is a positive decimal number
        derived from a SHA-256 digest of the batch's ids, so the same batch
        gets the same identifier in every process (the builtin ``hash()`` of
        strings is salted per interpreter run and is not reproducible).

        Yields:
            FileData describing one batch of document ids.
        """
        index_name = self.index_config.index_name
        batch_size = self.index_config.batch_size
        # Sorted so that batch membership does not depend on set iteration order.
        ids = sorted(self._get_doc_ids())
        # hosts is optional (a cloud_id may be configured instead); without a
        # host there is no URL to report.
        hosts = self.connection_config.hosts
        url = f"{hosts[0]}/{index_name}" if hosts else None
        for start in range(0, len(ids), batch_size):
            batch = ids[start : start + batch_size]  # noqa
            digest = hashlib.sha256(json.dumps(batch).encode("utf-8")).digest()
            # First 8 digest bytes as an unsigned integer: always non-negative.
            identified = str(int.from_bytes(digest[:8], "big"))
            yield FileData(
                identifier=identified,
                connector_type=CONNECTOR_TYPE,
                metadata=DataSourceMetadata(
                    url=url,
                    date_processed=str(time()),
                ),
                additional_metadata={
                    "ids": batch,
                    "index_name": index_name,
                },
            )
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@dataclass
class ElasticsearchDownloaderConfig(DownloaderConfig):
    """Settings for downloading documents from Elasticsearch."""

    # Field names sent as the query's "_source" when fetching documents;
    # defaults to an empty list.
    fields: list[str] = field(default_factory=list)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
@dataclass
|
|
188
|
+
class ElasticsearchDownloader(Downloader):
|
|
189
|
+
connection_config: ElasticsearchConnectionConfig
|
|
190
|
+
download_config: ElasticsearchDownloaderConfig
|
|
191
|
+
connector_type: str = CONNECTOR_TYPE
|
|
192
|
+
|
|
193
|
+
def is_async(self) -> bool:
|
|
194
|
+
return True
|
|
195
|
+
|
|
196
|
+
def get_identifier(self, index_name: str, record_id: str) -> str:
|
|
197
|
+
f = f"{index_name}-{record_id}"
|
|
198
|
+
if self.download_config.fields:
|
|
199
|
+
f = "{}-{}".format(
|
|
200
|
+
f,
|
|
201
|
+
hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
|
|
202
|
+
)
|
|
203
|
+
return f
|
|
204
|
+
|
|
205
|
+
def map_es_results(self, es_results: dict) -> str:
|
|
206
|
+
doc_body = es_results["_source"]
|
|
207
|
+
flattened_dict = flatten_dict(dictionary=doc_body)
|
|
208
|
+
str_values = [str(value) for value in flattened_dict.values()]
|
|
209
|
+
concatenated_values = "\n".join(str_values)
|
|
210
|
+
return concatenated_values
|
|
211
|
+
|
|
212
|
+
def generate_download_response(
|
|
213
|
+
self, result: dict, index_name: str, file_data: FileData
|
|
214
|
+
) -> DownloadResponse:
|
|
215
|
+
record_id = result["_id"]
|
|
216
|
+
filename_id = self.get_identifier(index_name=index_name, record_id=record_id)
|
|
217
|
+
filename = f"{filename_id}.txt"
|
|
218
|
+
download_path = self.download_dir / Path(filename)
|
|
219
|
+
logger.debug(
|
|
220
|
+
f"Downloading results from index {index_name} and id {record_id} to {download_path}"
|
|
221
|
+
)
|
|
222
|
+
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
223
|
+
try:
|
|
224
|
+
with open(download_path, "w", encoding="utf8") as f:
|
|
225
|
+
f.write(self.map_es_results(es_results=result))
|
|
226
|
+
except Exception as e:
|
|
227
|
+
logger.error(
|
|
228
|
+
f"failed to download from index {index_name} "
|
|
229
|
+
f"and id {record_id} to {download_path}: {e}",
|
|
230
|
+
exc_info=True,
|
|
231
|
+
)
|
|
232
|
+
raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
|
|
233
|
+
return DownloadResponse(
|
|
234
|
+
file_data=FileData(
|
|
235
|
+
identifier=filename_id,
|
|
236
|
+
connector_type=CONNECTOR_TYPE,
|
|
237
|
+
metadata=DataSourceMetadata(
|
|
238
|
+
version=str(result["_version"]) if "_version" in result else None,
|
|
239
|
+
date_processed=str(time()),
|
|
240
|
+
record_locator={
|
|
241
|
+
"hosts": self.connection_config.hosts,
|
|
242
|
+
"index_name": index_name,
|
|
243
|
+
"document_id": record_id,
|
|
244
|
+
},
|
|
245
|
+
),
|
|
246
|
+
),
|
|
247
|
+
path=download_path,
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
|
|
251
|
+
raise NotImplementedError()
|
|
252
|
+
|
|
253
|
+
@requires_dependencies(["elasticsearch"], extras="elasticsearch")
|
|
254
|
+
def load_async(self):
|
|
255
|
+
from elasticsearch import AsyncElasticsearch
|
|
256
|
+
from elasticsearch.helpers import async_scan
|
|
257
|
+
|
|
258
|
+
return AsyncElasticsearch, async_scan
|
|
259
|
+
|
|
260
|
+
async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
    """Fetch the documents referenced by ``file_data`` and build their responses.

    ``file_data.additional_metadata`` supplies the ``index_name`` and the
    list of document ``ids``. The index is scanned for exactly those ids and
    each hit is handed to ``generate_download_response``.

    Returns:
        A list with one download response per hit yielded by the scan.
    """
    client_cls, scan = self.load_async()

    index_name: str = file_data.additional_metadata["index_name"]
    ids: list[str] = file_data.additional_metadata["ids"]

    # Restrict returned fields to the configured ones and ask for the
    # document version alongside each hit.
    scan_query = {
        "_source": self.download_config.fields,
        "version": True,
        "query": {"ids": {"values": ids}},
    }

    responses = []
    async with client_cls(**self.connection_config.get_client_kwargs()) as client:
        hits = scan(client, query=scan_query, scroll="1m", index=index_name)
        responses.extend(
            [
                self.generate_download_response(
                    result=hit, index_name=index_name, file_data=file_data
                )
                async for hit in hits
            ]
        )
    return responses
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
@dataclass
class ElasticsearchUploadStagerConfig(UploadStagerConfig):
    """Settings used while staging elements for an Elasticsearch upload."""

    # Index name written into the ``_index`` field of every staged document.
    index_name: str
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
@dataclass
class ElasticsearchUploadStager(UploadStager):
    """Reshape element dicts from a JSON file into Elasticsearch bulk actions."""

    upload_stager_config: ElasticsearchUploadStagerConfig

    def conform_dict(self, data: dict) -> dict:
        """Build one bulk-action document from a single element dict.

        Args:
            data: Element dict. It is only read; the caller's dict is left
                unmodified.

        Returns:
            A dict with ``_index`` (from the stager config), a freshly
            generated UUID ``_id``, and a ``_source`` holding ``element_id``,
            ``embeddings``, ``text`` and ``type`` (``None`` when absent).
            When ``data["metadata"]`` is a dict, it is flattened with ``-``
            as the key separator and stored under ``_source["metadata"]``.
        """
        resp = {
            "_index": self.upload_stager_config.index_name,
            "_id": str(uuid.uuid4()),
            "_source": {
                # ``get`` instead of ``pop`` so the input dict is not mutated.
                "element_id": data.get("element_id"),
                "embeddings": data.get("embeddings"),
                "text": data.get("text"),
                "type": data.get("type"),
            },
        }
        metadata = data.get("metadata")
        if isinstance(metadata, dict):
            resp["_source"]["metadata"] = flatten_dict(metadata, separator="-")
        return resp

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Conform every element in ``elements_filepath`` and write the result.

        Args:
            elements_filepath: JSON file containing a list of element dicts.
            file_data: Not used by this stager.
            output_dir: Directory that receives the staged file.
            output_filename: Base name of the staged file; ``.json`` is appended.
            **kwargs: Ignored.

        Returns:
            Path of the written ``<output_dir>/<output_filename>.json`` file.
        """
        # Read as UTF-8 explicitly rather than relying on the platform's
        # locale encoding, which may not be UTF-8.
        with open(elements_filepath, encoding="utf-8") as elements_file:
            elements_contents = json.load(elements_file)
        conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        with open(output_path, "w", encoding="utf-8") as output_file:
            json.dump(conformed_elements, output_file)
        return output_path
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
@dataclass
class ElasticsearchUploaderConfig(UploaderConfig):
    """Settings controlling how documents are bulk-written to an index."""

    # Destination index; the uploader warns when it does not exist.
    index_name: str
    # Size limit, in bytes, applied when splitting documents into batches.
    batch_size_bytes: int = 15_000_000
    # Thread count handed to the parallel bulk helper.
    num_threads: int = 4
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
@dataclass
class ElasticsearchUploader(Uploader):
    """Write staged element documents to an Elasticsearch index in bulk."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: ElasticsearchUploaderConfig
    connection_config: ElasticsearchConnectionConfig

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def load_parallel_bulk(self):
        """Import and return ``elasticsearch.helpers.parallel_bulk`` on demand."""
        from elasticsearch.helpers import parallel_bulk

        return parallel_bulk

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Load every staged file in ``contents`` and bulk-upload its documents.

        Args:
            contents: Upload items whose ``path`` points at a JSON file
                holding a list of documents.
            **kwargs: Ignored.

        Documents from all files are pooled, split into batches bounded by
        ``upload_config.batch_size_bytes``, and each batch is sent through
        ``parallel_bulk`` using ``upload_config.num_threads`` threads. A
        missing destination index only produces a warning.
        """
        parallel_bulk = self.load_parallel_bulk()
        elements_dict = []
        for content in contents:
            # Read as UTF-8 explicitly instead of the platform's locale encoding.
            with open(content.path, encoding="utf-8") as elements_file:
                elements = json.load(elements_file)
                elements_dict.extend(elements)

        # Connector label used in log messages, derived from the class name.
        connector_name = self.__class__.__name__.replace("Uploader", "")
        upload_destination = self.connection_config.hosts or self.connection_config.cloud_id
        logger.info(
            f"writing {len(elements_dict)} elements via document batches to destination "
            f"index named {self.upload_config.index_name} at {upload_destination} with "
            f"batch size (in bytes) {self.upload_config.batch_size_bytes} with "
            f"{self.upload_config.num_threads} (number of) threads"
        )

        client = self.connection_config.get_client()
        if not client.indices.exists(index=self.upload_config.index_name):
            logger.warning(
                f"{connector_name} index does not exist: "
                f"{self.upload_config.index_name}. "
                "This may cause issues when uploading."
            )
        for batch in generator_batching_wbytes(
            elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
        ):
            for success, info in parallel_bulk(
                client=client,
                actions=batch,
                thread_count=self.upload_config.num_threads,
            ):
                if not success:
                    # The message needs a placeholder for ``info``: logging
                    # %-formats its extra arguments, so passing ``info``
                    # without one triggers a formatting error and drops the
                    # failure detail.
                    logger.error(
                        "upload failed for a batch in %s destination connector: %s",
                        connector_name,
                        info,
                    )
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
# Source side: index document ids, then download the documents themselves.
elasticsearch_source_entry = SourceRegistryEntry(
    indexer=ElasticsearchIndexer,
    indexer_config=ElasticsearchIndexerConfig,
    downloader=ElasticsearchDownloader,
    downloader_config=ElasticsearchDownloaderConfig,
    connection_config=ElasticsearchConnectionConfig,
)

# Destination side: stage elements into bulk actions, then upload them.
elasticsearch_destination_entry = DestinationRegistryEntry(
    upload_stager=ElasticsearchUploadStager,
    upload_stager_config=ElasticsearchUploadStagerConfig,
    uploader=ElasticsearchUploader,
    uploader_config=ElasticsearchUploaderConfig,
    connection_config=ElasticsearchConnectionConfig,
)
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
4
|
+
add_destination_entry,
|
|
5
|
+
add_source_entry,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
from .azure import CONNECTOR_TYPE as AZURE_CONNECTOR_TYPE
|
|
9
|
+
from .azure import azure_destination_entry, azure_source_entry
|
|
10
|
+
from .box import CONNECTOR_TYPE as BOX_CONNECTOR_TYPE
|
|
11
|
+
from .box import box_destination_entry, box_source_entry
|
|
12
|
+
from .dropbox import CONNECTOR_TYPE as DROPBOX_CONNECTOR_TYPE
|
|
13
|
+
from .dropbox import dropbox_destination_entry, dropbox_source_entry
|
|
14
|
+
from .gcs import CONNECTOR_TYPE as GCS_CONNECTOR_TYPE
|
|
15
|
+
from .gcs import gcs_destination_entry, gcs_source_entry
|
|
16
|
+
from .s3 import CONNECTOR_TYPE as S3_CONNECTOR_TYPE
|
|
17
|
+
from .s3 import s3_destination_entry, s3_source_entry
|
|
18
|
+
from .sftp import CONNECTOR_TYPE as SFTP_CONNECTOR_TYPE
|
|
19
|
+
from .sftp import sftp_destination_entry, sftp_source_entry
|
|
20
|
+
|
|
21
|
+
# Register every fsspec-backed connector as both a source and a destination.
# Registration order: azure, box, dropbox, gcs, s3, sftp; each connector's
# source entry is added before its destination entry.
for _connector_type, _source_entry, _destination_entry in (
    (AZURE_CONNECTOR_TYPE, azure_source_entry, azure_destination_entry),
    (BOX_CONNECTOR_TYPE, box_source_entry, box_destination_entry),
    (DROPBOX_CONNECTOR_TYPE, dropbox_source_entry, dropbox_destination_entry),
    (GCS_CONNECTOR_TYPE, gcs_source_entry, gcs_destination_entry),
    (S3_CONNECTOR_TYPE, s3_source_entry, s3_destination_entry),
    (SFTP_CONNECTOR_TYPE, sftp_source_entry, sftp_destination_entry),
):
    add_source_entry(source_type=_connector_type, entry=_source_entry)
    add_destination_entry(destination_type=_connector_type, entry=_destination_entry)

# Keep the loop variables out of the package namespace.
del _connector_type, _source_entry, _destination_entry
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Generator, Optional
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
8
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
10
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
11
|
+
DestinationRegistryEntry,
|
|
12
|
+
SourceRegistryEntry,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
15
|
+
FsspecAccessConfig,
|
|
16
|
+
FsspecConnectionConfig,
|
|
17
|
+
FsspecDownloader,
|
|
18
|
+
FsspecDownloaderConfig,
|
|
19
|
+
FsspecIndexer,
|
|
20
|
+
FsspecIndexerConfig,
|
|
21
|
+
FsspecUploader,
|
|
22
|
+
FsspecUploaderConfig,
|
|
23
|
+
)
|
|
24
|
+
from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
|
|
25
|
+
|
|
26
|
+
# Identifier used as the ``connector_type`` default of the Azure connector
# classes defined in this module.
CONNECTOR_TYPE = "azure"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def azure_json_serial(obj):
    """Convert Azure-specific values into JSON-friendly ones.

    ``ContentSettings`` instances become plain dicts, ``bytearray`` values
    become their ``str()`` form, and everything else is delegated to
    ``json_serial``.
    """
    from azure.storage.blob._models import ContentSettings

    if isinstance(obj, ContentSettings):
        converted = dict(obj)
    elif isinstance(obj, bytearray):
        converted = str(obj)
    else:
        converted = json_serial(obj)
    return converted
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class AzureIndexerConfig(FsspecIndexerConfig):
    """Indexer settings for Azure; adds nothing to ``FsspecIndexerConfig``."""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class AzureAccessConfig(FsspecAccessConfig):
    """Azure credentials; a connection string or an account name is required."""

    account_name: Optional[str] = None
    account_key: Optional[str] = None
    connection_string: Optional[str] = None
    sas_token: Optional[str] = None

    def __post_init__(self):
        """Validate that the account can be identified.

        Raises:
            ValueError: if both ``connection_string`` and ``account_name``
                are ``None``.
        """
        identified = self.connection_string is not None or self.account_name is not None
        if not identified:
            raise ValueError("either connection_string or account_name must be set")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class AzureConnectionConfig(FsspecConnectionConfig):
    """Connection settings for Azure storage accessed through the ``az`` protocol."""

    supported_protocols: list[str] = field(default_factory=lambda: ["az"])
    access_config: AzureAccessConfig = enhanced_field(
        sensitive=True, default_factory=lambda: AzureAccessConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    def get_access_config(self) -> dict[str, Any]:
        """Return the access settings as a dict, leaving out unset entries.

        Any entry whose value is falsy (``None``, empty string, ...) is
        dropped so it is not injected into the result.
        """
        populated: dict[str, Any] = {}
        for key, value in self.access_config.to_dict().items():
            if value:
                populated[key] = value
        return populated
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass
class AzureIndexer(FsspecIndexer):
    """Indexer for Azure storage built on the generic fsspec indexer."""

    connection_config: AzureConnectionConfig
    index_config: AzureIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def sterilize_info(self, path) -> dict:
        """Look up filesystem info for ``path`` and sterilize it.

        ``azure_json_serial`` is supplied as the ``default`` converter.
        """
        return sterilize_dict(data=self.fs.info(path=path), default=azure_json_serial)

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Delegate to the parent indexer after the dependency check."""
        file_data_stream = super().run(**kwargs)
        return file_data_stream
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
class AzureDownloaderConfig(FsspecDownloaderConfig):
    """Downloader settings for Azure; adds nothing to ``FsspecDownloaderConfig``."""
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass
class AzureDownloader(FsspecDownloader):
    """Downloader for Azure storage built on the generic fsspec downloader."""

    protocol: str = "az"
    connection_config: AzureConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[AzureDownloaderConfig] = field(default_factory=AzureDownloaderConfig)

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Delegate the blocking download to the parent after the dependency check."""
        response = super().run(file_data=file_data, **kwargs)
        return response

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Delegate the async download to the parent after the dependency check."""
        response = await super().run_async(file_data=file_data, **kwargs)
        return response
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@dataclass
class AzureUploaderConfig(FsspecUploaderConfig):
    """Uploader settings for Azure; adds nothing to ``FsspecUploaderConfig``."""
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@dataclass
class AzureUploader(FsspecUploader):
    """Uploader for Azure storage built on the generic fsspec uploader."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: AzureConnectionConfig
    upload_config: AzureUploaderConfig = field(default=None)

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    def __post_init__(self):
        """Run the parent's post-init once the dependency check has passed."""
        super().__post_init__()

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Delegate the blocking upload to the parent after the dependency check."""
        outcome = super().run(contents=contents, **kwargs)
        return outcome

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Delegate the async upload to the parent after the dependency check."""
        outcome = await super().run_async(path=path, file_data=file_data, **kwargs)
        return outcome
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# Source side of the Azure connector: indexer plus downloader.
azure_source_entry = SourceRegistryEntry(
    connection_config=AzureConnectionConfig,
    indexer_config=AzureIndexerConfig,
    indexer=AzureIndexer,
    downloader_config=AzureDownloaderConfig,
    downloader=AzureDownloader,
)

# Destination side of the Azure connector: uploader only.
azure_destination_entry = DestinationRegistryEntry(
    connection_config=AzureConnectionConfig,
    uploader_config=AzureUploaderConfig,
    uploader=AzureUploader,
)
|