unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_default_pandas_dtypes() -> dict[str, Any]:
    """Return the default pandas dtype for every known flattened element column.

    The mapping is keyed by column name. Values are whatever ``DataFrame.astype``
    accepts: nullable string dtype instances, the ``"Int64"`` / ``"boolean"``
    extension dtype aliases, ``float`` or ``object``.

    Returns:
        A freshly built dict on every call, so callers may modify it freely.
    """
    # Each string column gets its own StringDtype instance, created on demand.
    string = pd.StringDtype
    nullable_int = "Int64"
    nullable_bool = "boolean"

    return {
        # Core element fields
        "text": string(),
        "type": string(),
        "element_id": string(),
        # Flattened metadata fields
        "filename": string(),
        "filetype": string(),
        "file_directory": string(),
        "last_modified": string(),
        "attached_to_filename": string(),
        "parent_id": string(),
        "category_depth": nullable_int,
        "image_path": string(),
        "languages": object,
        "page_number": nullable_int,
        "page_name": string(),
        "url": string(),
        "link_urls": string(),
        "link_texts": object,
        "links": object,
        "sent_from": object,
        "sent_to": object,
        "subject": string(),
        "section": string(),
        "header_footer_type": string(),
        "emphasized_text_contents": object,
        "emphasized_text_tags": object,
        "text_as_html": string(),
        "regex_metadata": object,
        "max_characters": nullable_int,
        "is_continuation": nullable_bool,
        "detection_class_prob": float,
        "sender": string(),
        # Flattened coordinates fields
        "coordinates_points": object,
        "coordinates_system": string(),
        "coordinates_layout_width": float,
        "coordinates_layout_height": float,
        # Flattened data-source fields
        "data_source_url": string(),
        "data_source_version": string(),
        "data_source_record_locator": object,
        "data_source_date_created": string(),
        "data_source_date_modified": string(),
        "data_source_date_processed": string(),
        "data_source_permissions_data": object,
        "embeddings": object,
        "regex_metadata_key": object,
    }
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def convert_to_pandas_dataframe(
    elements_dict: list[dict[str, Any]],
    drop_empty_cols: bool = False,
) -> pd.DataFrame:
    """Build a typed DataFrame from a list of element dictionaries.

    A truthy ``metadata`` entry on an element is flattened into top-level
    columns via ``flatten_dict`` (with ``data_source_record_locator`` passed as
    a key to omit). Columns that appear in ``get_default_pandas_dtypes()`` are
    then cast to their default dtype; other columns keep the dtype pandas
    inferred.

    The input dictionaries are not modified: flattening happens on shallow
    copies, so the caller's elements keep their ``metadata`` key.

    Args:
        elements_dict: One dictionary per element, either already flat or
            carrying a nested ``metadata`` mapping.
        drop_empty_cols: When true, drop columns whose values are all missing.

    Returns:
        The resulting DataFrame, one row per element.
    """
    rows: list[dict[str, Any]] = []
    for element in elements_dict:
        # Shallow copy so popping/updating below never touches the caller's dict.
        row = dict(element)
        # Flatten metadata if it hasn't already been flattened
        if metadata := row.pop("metadata", None):
            row.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))
        rows.append(row)

    df = pd.DataFrame.from_dict(rows)
    # Only cast the columns that are actually present in this frame.
    dt = {k: v for k, v in get_default_pandas_dtypes().items() if k in df.columns}
    df = df.astype(dt)
    if drop_empty_cols:
        df.dropna(axis=1, how="all", inplace=True)
    return df
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
File without changes
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from dataclasses import dataclass, field, fields
|
|
4
|
+
from typing import Any, Optional, Type, TypeVar
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.v2.cli.base.importer import import_from_string
|
|
9
|
+
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
10
|
+
from unstructured_ingest.v2.cli.utils import extract_config
|
|
11
|
+
from unstructured_ingest.v2.interfaces import ProcessorConfig
|
|
12
|
+
from unstructured_ingest.v2.logger import logger
|
|
13
|
+
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
|
|
14
|
+
from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
|
|
15
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
16
|
+
DownloaderT,
|
|
17
|
+
IndexerT,
|
|
18
|
+
UploaderT,
|
|
19
|
+
UploadStager,
|
|
20
|
+
UploadStagerConfig,
|
|
21
|
+
UploadStagerT,
|
|
22
|
+
destination_registry,
|
|
23
|
+
source_registry,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig
|
|
26
|
+
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
27
|
+
from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
|
|
28
|
+
|
|
29
|
+
CommandT = TypeVar("CommandT", bound=click.Command)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
|
|
33
|
+
class BaseCmd(ABC):
|
|
34
|
+
cmd_name: str
|
|
35
|
+
default_configs: list[Type[CliConfig]] = field(default_factory=list)
|
|
36
|
+
|
|
37
|
+
@property
def cmd_name_key(self):
    """Return ``cmd_name`` with every dash turned into an underscore."""
    parts = self.cmd_name.split("-")
    return "_".join(parts)
|
|
40
|
+
|
|
41
|
+
@property
def cli_cmd_name(self):
    """Return ``cmd_name`` with every underscore turned into a dash."""
    parts = self.cmd_name.split("_")
    return "-".join(parts)
|
|
44
|
+
|
|
45
|
+
@abstractmethod
def cmd(self, ctx: click.Context, **options) -> None:
    """Command entry point; abstract, so subclasses must provide the body.

    Args:
        ctx: The click context for the invocation.
        **options: Keyword options for the command (presumably the parsed
            CLI option values; confirm against the subclasses).
    """
    ...
|
|
48
|
+
|
|
49
|
+
def add_options(self, cmd: CommandT, extras: list[Type[CliConfig]]) -> CommandT:
    """Attach the CLI options of each config class to ``cmd``.

    Options from ``extras`` are added first, followed by those from
    ``self.default_configs``. Neither list is modified.

    Args:
        cmd: The click command to decorate with options.
        extras: Config classes specific to this command.

    Returns:
        The same ``cmd`` object, for chaining.

    Raises:
        ValueError: If a config class fails to register its options; the
            original error is kept as the cause.
    """
    # make sure what's unique to this cmd appears first
    # Build a new list instead of extending ``extras`` so repeated calls with
    # the same list do not keep accumulating the default configs.
    configs = [*extras, *self.default_configs]
    for config in configs:
        try:
            config.add_cli_options(cmd=cmd)
        except ValueError as e:
            raise ValueError(f"failed to set configs from {config.__name__}: {e}") from e
    return cmd
|
|
59
|
+
|
|
60
|
+
def get_pipline(
    self,
    src: str,
    source_options: dict[str, Any],
    dest: Optional[str] = None,
    destination_options: Optional[dict[str, Any]] = None,
) -> Pipeline:
    """Assemble a ``Pipeline`` from flat CLI options.

    The processor config, downloader, indexer and partitioner are always built
    from ``source_options``; a chunker and an embedder are added only when the
    corresponding getter returns one. With ``dest`` set, the destination's
    uploader (and stager, if any) is built from the destination options;
    otherwise the default local uploader is used, taking ``output_dir`` from
    ``source_options`` when the destination options do not carry one.

    Args:
        src: Key of the source connector in the source registry.
        source_options: Flat option mapping for the source side.
        dest: Key of the destination connector, or ``None`` for the default
            local uploader.
        destination_options: Flat option mapping for the destination side.
            ``None`` is treated as empty; the mapping itself is never modified.

    Returns:
        The configured pipeline.

    Raises:
        KeyError: If no ``dest`` is given and ``output_dir`` is in neither the
            destination options nor ``source_options``.
    """
    logger.debug(
        f"creating pipeline from cli using source {src} with options: {source_options}"
    )
    # Private copy: avoids calling methods on None when ``dest`` is given
    # without options, and keeps the ``output_dir`` default below from leaking
    # into the caller's dict.
    dest_options: dict[str, Any] = dict(destination_options) if destination_options else {}
    pipeline_kwargs: dict[str, Any] = {
        "context": self.get_processor_config(options=source_options),
        "downloader": self.get_downloader(src=src, options=source_options),
        "indexer": self.get_indexer(src=src, options=source_options),
        "partitioner": self.get_partitioner(options=source_options),
    }
    if chunker := self.get_chunker(options=source_options):
        pipeline_kwargs["chunker"] = chunker
    if embedder := self.get_embeder(options=source_options):
        pipeline_kwargs["embedder"] = embedder
    if dest:
        logger.debug(
            f"setting destination on pipeline {dest} with options: {dest_options}"
        )
        if uploader_stager := self.get_upload_stager(dest=dest, options=dest_options):
            pipeline_kwargs["stager"] = uploader_stager
        pipeline_kwargs["uploader"] = self.get_uploader(dest=dest, options=dest_options)
    else:
        # Default to local uploader
        # TODO remove after v1 no longer supported
        if "output_dir" not in dest_options:
            dest_options["output_dir"] = source_options["output_dir"]
        pipeline_kwargs["uploader"] = self.get_default_uploader(options=dest_options)
    return Pipeline(**pipeline_kwargs)
|
|
95
|
+
|
|
96
|
+
@staticmethod
def get_default_uploader(options: dict[str, Any]) -> UploaderT:
    """Create a ``LocalUploader`` configured from the flat ``options`` mapping."""
    return LocalUploader(
        upload_config=extract_config(flat_data=options, config=LocalUploaderConfig)
    )
|
|
100
|
+
|
|
101
|
+
@staticmethod
def get_chunker(options: dict[str, Any]) -> Optional[Chunker]:
    """Create a ``Chunker`` from ``options``.

    Returns:
        ``None`` when the extracted config has a falsy ``chunking_strategy``,
        otherwise a chunker built from that config.
    """
    config = extract_config(flat_data=options, config=ChunkerConfig)
    return Chunker(config=config) if config.chunking_strategy else None
|
|
107
|
+
|
|
108
|
+
@staticmethod
def get_embeder(options: dict[str, Any]) -> Optional[Embedder]:
    """Create an ``Embedder`` from ``options``.

    Returns:
        ``None`` when the extracted config has a falsy ``embedding_provider``,
        otherwise an embedder built from that config.
    """
    config = extract_config(flat_data=options, config=EmbedderConfig)
    return Embedder(config=config) if config.embedding_provider else None
|
|
114
|
+
|
|
115
|
+
@staticmethod
def get_partitioner(options: dict[str, Any]) -> Partitioner:
    """Create a ``Partitioner`` configured from the flat ``options`` mapping."""
    return Partitioner(
        config=extract_config(flat_data=options, config=PartitionerConfig)
    )
|
|
119
|
+
|
|
120
|
+
@staticmethod
def get_processor_config(options: dict[str, Any]) -> ProcessorConfig:
    """Extract a ``ProcessorConfig`` from the flat ``options`` mapping."""
    processor_config = extract_config(flat_data=options, config=ProcessorConfig)
    return processor_config
|
|
123
|
+
|
|
124
|
+
@staticmethod
def get_indexer(src: str, options: dict[str, Any]) -> IndexerT:
    """Instantiate the indexer registered for the source ``src``.

    For each config class the registry entry declares (indexer config,
    connection config), a config object is extracted from ``options`` and
    passed to the indexer constructor; undeclared configs are simply omitted.

    Raises:
        KeyError: If ``src`` is missing from ``source_registry`` (assuming the
            registry is a plain mapping; confirm).
    """
    entry = source_registry[src]
    # (constructor keyword, attribute on the registry entry holding the config class)
    wiring = (
        ("index_config", "indexer_config"),
        ("connection_config", "connection_config"),
    )
    kwargs: dict[str, Any] = {}
    for kwarg_name, entry_attr in wiring:
        config_cls = getattr(entry, entry_attr)
        if config_cls:
            kwargs[kwarg_name] = extract_config(flat_data=options, config=config_cls)
    return entry.indexer(**kwargs)
|
|
138
|
+
|
|
139
|
+
@staticmethod
def get_downloader(src: str, options: dict[str, Any]) -> DownloaderT:
    """Instantiate the downloader registered for the source ``src``.

    For each config class the registry entry declares (downloader config,
    connection config), a config object is extracted from ``options`` and
    passed to the downloader constructor; undeclared configs are simply omitted.

    Raises:
        KeyError: If ``src`` is missing from ``source_registry`` (assuming the
            registry is a plain mapping; confirm).
    """
    entry = source_registry[src]
    # (constructor keyword, attribute on the registry entry holding the config class)
    wiring = (
        ("download_config", "downloader_config"),
        ("connection_config", "connection_config"),
    )
    kwargs: dict[str, Any] = {}
    for kwarg_name, entry_attr in wiring:
        config_cls = getattr(entry, entry_attr)
        if config_cls:
            kwargs[kwarg_name] = extract_config(flat_data=options, config=config_cls)
    return entry.downloader(**kwargs)
|
|
153
|
+
|
|
154
|
+
@staticmethod
def get_custom_stager(
    stager_reference: str, stager_config_kwargs: Optional[dict] = None
) -> Optional[UploadStagerT]:
    """Import and instantiate a user-supplied upload stager class.

    ``stager_reference`` is resolved with ``import_from_string``. The result
    must be a class that subclasses ``UploadStager``, and the declared type of
    its ``upload_stager_config`` dataclass field must be a class that
    subclasses ``UploadStagerConfig``.

    Args:
        stager_reference: Import reference understood by ``import_from_string``.
        stager_config_kwargs: Keyword arguments for the stager's config class.
            When empty or ``None`` the stager is constructed with no arguments.

    Returns:
        An instance of the referenced stager class. As written this never
        returns ``None``; the ``Optional`` annotation is kept for interface
        compatibility.

    Raises:
        ValueError: If the reference or its config type fails validation.
        KeyError: If the stager class has no ``upload_stager_config`` field.
    """
    stager_cls = import_from_string(stager_reference)
    if not inspect.isclass(stager_cls):
        raise ValueError(
            f"custom stager must be a reference to a python class, got: {type(stager_cls)}"
        )
    if not issubclass(stager_cls, UploadStager):
        raise ValueError(
            "custom stager must be an implementation of the UploadStager interface"
        )
    # The config class is discovered from the declared type of the stager's
    # ``upload_stager_config`` dataclass field.
    field_types = {f.name: f.type for f in fields(stager_cls)}
    stager_config_cls = field_types["upload_stager_config"]
    if not inspect.isclass(stager_config_cls):
        raise ValueError(
            f"custom stager config must be a class, got: {type(stager_config_cls)}"
        )
    if not issubclass(stager_config_cls, UploadStagerConfig):
        raise ValueError(
            "custom stager config must be an implementation "
            "of the UploadStagerConfig interface"
        )
    stager_kwargs: dict[str, Any] = {}
    if stager_config_kwargs:
        stager_kwargs["upload_stager_config"] = stager_config_cls(**stager_config_kwargs)
    return stager_cls(**stager_kwargs)
|
|
184
|
+
|
|
185
|
+
@staticmethod
def get_upload_stager(dest: str, options: dict[str, Any]) -> Optional[UploadStagerT]:
    """Return the upload stager to use for the destination ``dest``.

    A truthy ``custom_stager`` option takes precedence and is resolved via
    ``BaseCmd.get_custom_stager``. Otherwise the stager registered for the
    destination is built, or ``None`` is returned when the destination does
    not register one.
    """
    custom_reference = options.get("custom_stager")
    if custom_reference:
        return BaseCmd.get_custom_stager(
            stager_reference=custom_reference,
            stager_config_kwargs=options.get("custom_stager_config_kwargs"),
        )

    entry = destination_registry[dest]
    stager_kwargs: dict[str, Any] = {}
    config_cls = entry.upload_stager_config
    if config_cls:
        # Extracted even when no stager class is registered, so config
        # extraction problems still surface.
        stager_kwargs["upload_stager_config"] = extract_config(
            flat_data=options, config=config_cls
        )
    stager_cls = entry.upload_stager
    return stager_cls(**stager_kwargs) if stager_cls else None
|
|
201
|
+
|
|
202
|
+
@staticmethod
def get_uploader(dest: str, options: dict[str, Any]) -> UploaderT:
    """Instantiate the uploader registered for the destination ``dest``.

    Args:
        dest: Key into ``destination_registry``.
        options: Flat CLI options from which the configs are extracted.

    Returns:
        The uploader built from whichever of the uploader config and
        connection config the registry entry declares.
    """
    dest_entry = destination_registry[dest]
    uploader_kwargs: dict[str, Any] = {}
    if uploader_config_cls := dest_entry.uploader_config:
        uploader_kwargs["upload_config"] = extract_config(
            flat_data=options, config=uploader_config_cls
        )
    if connection_config_cls := dest_entry.connection_config:
        uploader_kwargs["connection_config"] = extract_config(
            flat_data=options, config=connection_config_cls
        )
    uploader_cls = dest_entry.uploader
    return uploader_cls(**uploader_kwargs)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Optional, Type
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.v2.cli.base.cmd import BaseCmd
|
|
8
|
+
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
9
|
+
from unstructured_ingest.v2.cli.utils import Dict, conform_click_options
|
|
10
|
+
from unstructured_ingest.v2.logger import logger
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class DestCmd(BaseCmd):
    """CLI command for a destination connector, nested under a source command."""

    # Optional CLI config classes whose options are attached to the command.
    connection_config: Optional[Type[CliConfig]] = None
    uploader_config: Optional[Type[CliConfig]] = None
    upload_stager_config: Optional[Type[CliConfig]] = None

    def cmd(self, ctx: click.Context, **options) -> None:
        # NOTE: documented with comments rather than a docstring on purpose:
        # this function is handed to ``click.command`` in ``get_cmd`` and click
        # uses a callback's docstring as the command's help text.
        #
        # Runs the pipeline from the parent (source) command to this
        # destination. Any failure is logged and re-raised as a
        # ``click.ClickException``.
        logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
        if not ctx.parent:
            raise click.ClickException("destination command called without a parent")
        if not ctx.parent.info_name:
            raise click.ClickException("parent command missing info name")
        # The parent's CLI name uses dashes; the source lookup key uses underscores.
        source_cmd = ctx.parent.info_name.replace("-", "_")
        # ctx.parent is guaranteed to be set by the guard above.
        source_options: dict = ctx.parent.params
        conform_click_options(options)
        try:
            pipeline = self.get_pipline(
                src=source_cmd,
                source_options=source_options,
                dest=self.cmd_name,
                destination_options=options,
            )
            pipeline.run()
        except Exception as e:
            logger.error(f"failed to run destination command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Command:
        """Build the click command for this destination without using decorators.

        Raises:
            ValueError: If click does not produce a ``click.Command``.
        """
        # Dynamically create the command without the use of click decorators
        fn = self.cmd
        fn = click.pass_context(fn)
        cmd = click.command(fn)
        if not isinstance(cmd, click.core.Command):
            raise ValueError(f"generated command was not of expected type Command: {type(cmd)}")
        cmd.name = self.cli_cmd_name
        cmd.short_help = "v2"
        cmd.invoke_without_command = True
        # Only the config classes this destination actually declares contribute options.
        extras = [
            x
            for x in [self.uploader_config, self.upload_stager_config, self.connection_config]
            if x
        ]
        self.add_options(cmd, extras=extras)
        cmd.params.append(
            click.Option(
                ["--custom-stager"],
                required=False,
                type=str,
                default=None,
                help="Pass a pointer to a custom upload stager to use, "
                "must be in format '<module>:<attribute>'",
            )
        )
        cmd.params.append(
            click.Option(
                ["--custom-stager-config-kwargs"],
                required=False,
                type=Dict(),
                default=None,
                help="Any kwargs to instantiate the configuration "
                "associated with the custom stager",
            )
        )
        return cmd
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ImportFromStringError(Exception):
    """Raised when an import reference is malformed or cannot be resolved."""


def import_from_string(import_str: Any) -> Any:
    """Resolve a ``"<module>:<attribute>"`` reference to the object it names.

    Args:
        import_str: The reference to resolve. The attribute part may be a
            dotted path (``"pkg.mod:Class.attr"``). Anything that is not a
            ``str`` is returned unchanged.

    Returns:
        The object found by importing the module and walking the attribute
        path, or ``import_str`` itself when it is not a string.

    Raises:
        ImportFromStringError: If the string is not in
            ``"<module>:<attribute>"`` form, the module itself cannot be
            found, or the attribute path does not exist. The underlying
            ``ModuleNotFoundError`` / ``AttributeError`` is attached as the
            cause.
        ModuleNotFoundError: If importing the module fails because some
            *other* module (for example one of its dependencies) is missing.
    """
    if not isinstance(import_str, str):
        return import_str

    module_str, _, attrs_str = import_str.partition(":")
    if not module_str or not attrs_str:
        raise ImportFromStringError(
            f'Import string "{import_str}" must be in format "<module>:<attribute>".'
        )

    try:
        module = importlib.import_module(module_str)
    except ModuleNotFoundError as exc:
        # A different module is missing (e.g. a dependency of the target):
        # surface that error untouched instead of blaming the reference.
        if exc.name != module_str:
            raise exc from None
        raise ImportFromStringError(f'Could not import module "{module_str}".') from exc

    instance = module
    try:
        for attr_str in attrs_str.split("."):
            instance = getattr(instance, attr_str)
    except AttributeError as exc:
        raise ImportFromStringError(
            f'Attribute "{attrs_str}" not found in module "{module_str}".'
        ) from exc

    return instance
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import Optional, Type
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.v2.cli.base.cmd import BaseCmd
|
|
8
|
+
from unstructured_ingest.v2.cli.configs import (
|
|
9
|
+
ChunkerCliConfig,
|
|
10
|
+
EmbedderCliConfig,
|
|
11
|
+
PartitionerCliConfig,
|
|
12
|
+
ProcessorCliConfig,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
15
|
+
from unstructured_ingest.v2.cli.utils import Group, conform_click_options
|
|
16
|
+
from unstructured_ingest.v2.logger import logger
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class SrcCmd(BaseCmd):
    """CLI command group for a source connector."""

    # Optional CLI config classes whose options are attached to the group.
    indexer_config: Optional[Type[CliConfig]] = None
    downloader_config: Optional[Type[CliConfig]] = None
    connection_config: Optional[Type[CliConfig]] = None
    # Holds config *classes* (not instances), hence ``Type[CliConfig]``.
    default_configs: list[Type[CliConfig]] = field(
        default_factory=lambda: [
            ProcessorCliConfig,
            PartitionerCliConfig,
            EmbedderCliConfig,
            ChunkerCliConfig,
        ]
    )

    def cmd(self, ctx: click.Context, **options) -> None:
        # NOTE: documented with comments rather than a docstring on purpose:
        # this function is handed to ``click.group`` in ``get_cmd`` and click
        # uses a callback's docstring as the command's help text.
        #
        # When a subcommand was invoked, that subcommand is left to do the
        # work and this callback returns immediately.
        if ctx.invoked_subcommand:
            return

        conform_click_options(options)
        logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
        try:
            pipeline = self.get_pipline(src=self.cmd_name, source_options=options)
            pipeline.run()
        except Exception as e:
            logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Group:
        """Build the click group for this source without using decorators.

        Raises:
            ValueError: If click does not produce a ``click.Group``.
        """
        # Dynamically create the command without the use of click decorators
        fn = self.cmd
        fn = click.pass_context(fn)
        cmd = click.group(fn, cls=Group)
        if not isinstance(cmd, click.core.Group):
            raise ValueError(f"generated src command was not of expected type Group: {type(cmd)}")
        cmd.name = self.cli_cmd_name
        cmd.short_help = "v2"
        # Lets the source be run on its own, with no destination subcommand.
        cmd.invoke_without_command = True
        extras = [
            x for x in [self.indexer_config, self.downloader_config, self.connection_config] if x
        ]
        self.add_options(cmd, extras=extras)

        # TODO remove after v1 no longer supported
        cmd.params.append(
            click.Option(
                ["--output-dir"],
                required=False,
                type=str,
                help="Local path to write partitioned output to",
            )
        )
        return cmd
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import click
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.v2.cli.cmds import dest, src
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@click.group()
def ingest():
    # Root command group; it does no work itself. Kept as a comment rather
    # than a docstring because click would display a docstring as help text.
    return None
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_cmd() -> click.Command:
    """Assemble and return the root ``ingest`` command for the CLI.

    Every destination command is registered under every source group, and
    each source group is registered under ``ingest``, giving invocations of
    the form ``ingest <source> ... <destination> ...``.
    """
    for source_group in src:
        # Make every destination reachable from this source.
        for destination_command in dest:
            source_group.add_command(destination_command)
        ingest.add_command(source_group)
    return ingest
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from collections import Counter
|
|
2
|
+
|
|
3
|
+
import click
|
|
4
|
+
|
|
5
|
+
from .astra import astra_dest_cmd
|
|
6
|
+
from .azure_cognitive_search import azure_cognitive_search_dest_cmd
|
|
7
|
+
from .chroma import chroma_dest_cmd
|
|
8
|
+
from .databricks_volumes import databricks_volumes_dest_cmd
|
|
9
|
+
from .elasticsearch import elasticsearch_dest_cmd, elasticsearch_src_cmd
|
|
10
|
+
from .fsspec.azure import azure_dest_cmd, azure_src_cmd
|
|
11
|
+
from .fsspec.box import box_dest_cmd, box_src_cmd
|
|
12
|
+
from .fsspec.dropbox import dropbox_dest_cmd, dropbox_src_cmd
|
|
13
|
+
from .fsspec.gcs import gcs_dest_cmd, gcs_src_cmd
|
|
14
|
+
from .fsspec.s3 import s3_dest_cmd, s3_src_cmd
|
|
15
|
+
from .fsspec.sftp import sftp_dest_cmd, sftp_src_cmd
|
|
16
|
+
from .google_drive import google_drive_src_cmd
|
|
17
|
+
from .local import local_dest_cmd, local_src_cmd
|
|
18
|
+
from .mongodb import mongodb_dest_cmd
|
|
19
|
+
from .onedrive import onedrive_drive_src_cmd
|
|
20
|
+
from .opensearch import opensearch_dest_cmd, opensearch_src_cmd
|
|
21
|
+
from .pinecone import pinecone_dest_cmd
|
|
22
|
+
from .salesforce import salesforce_src_cmd
|
|
23
|
+
from .sharepoint import sharepoint_drive_src_cmd
|
|
24
|
+
from .singlestore import singlestore_dest_cmd
|
|
25
|
+
from .sql import sql_dest_cmd
|
|
26
|
+
from .weaviate import weaviate_dest_cmd
|
|
27
|
+
|
|
28
|
+
# Source connector commands, in registration order.
src_cmds = [
    azure_src_cmd,
    box_src_cmd,
    dropbox_src_cmd,
    elasticsearch_src_cmd,
    gcs_src_cmd,
    google_drive_src_cmd,
    local_src_cmd,
    onedrive_drive_src_cmd,
    opensearch_src_cmd,
    s3_src_cmd,
    salesforce_src_cmd,
    sharepoint_drive_src_cmd,
    sftp_src_cmd,
]
# Fail at import time if two source commands share a name.
duplicate_src_names = [
    cmd_name
    for cmd_name, occurrences in Counter(cmd.cmd_name for cmd in src_cmds).items()
    if occurrences >= 2
]
if duplicate_src_names:
    raise ValueError(
        "the following source cmd names were reused, all must be unique: {}".format(
            ", ".join(duplicate_src_names)
        )
    )
|
|
52
|
+
|
|
53
|
+
# Destination connector commands, in registration order.
dest_cmds = [
    astra_dest_cmd,
    azure_cognitive_search_dest_cmd,
    azure_dest_cmd,
    box_dest_cmd,
    chroma_dest_cmd,
    dropbox_dest_cmd,
    elasticsearch_dest_cmd,
    gcs_dest_cmd,
    local_dest_cmd,
    opensearch_dest_cmd,
    pinecone_dest_cmd,
    s3_dest_cmd,
    sftp_dest_cmd,
    singlestore_dest_cmd,
    weaviate_dest_cmd,
    mongodb_dest_cmd,
    databricks_volumes_dest_cmd,
    sql_dest_cmd,
]

# Fail at import time if two destination commands share a name.
duplicate_dest_names = [
    cmd_name
    for cmd_name, occurrences in Counter(cmd.cmd_name for cmd in dest_cmds).items()
    if occurrences >= 2
]
if duplicate_dest_names:
    raise ValueError(
        "the following dest cmd names were reused, all must be unique: {}".format(
            ", ".join(duplicate_dest_names)
        )
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# Click objects built from the command definitions above, in the same order.
src: list[click.Group] = [source_cmd.get_cmd() for source_cmd in src_cmds]

dest: list[click.Command] = [destination_cmd.get_cmd() for destination_cmd in dest_cmds]
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
import click
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
+
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
+
from unstructured_ingest.v2.cli.utils import Dict
|
|
8
|
+
from unstructured_ingest.v2.processes.connectors.astra import CONNECTOR_TYPE
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class AstraCliConnectionConfig(CliConfig):
    """CLI options carrying the Astra DB connection settings."""

    @staticmethod
    def get_cli_options() -> list[click.Option]:
        """Return the required ``--token`` and ``--api-endpoint`` options.

        Both options can also be supplied through the environment variables
        named in their ``envvar`` setting.
        """
        token_option = click.Option(
            ["--token"],
            required=True,
            type=str,
            help="Astra DB Token with access to the database.",
            envvar="ASTRA_DB_TOKEN",
            show_envvar=True,
        )
        api_endpoint_option = click.Option(
            ["--api-endpoint"],
            required=True,
            type=str,
            help="The API endpoint for the Astra DB.",
            envvar="ASTRA_DB_ENDPOINT",
            show_envvar=True,
        )
        return [token_option, api_endpoint_option]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class AstraCliUploaderConfig(CliConfig):
    """CLI options controlling how records are uploaded to Astra DB."""

    @staticmethod
    def get_cli_options() -> list[click.Option]:
        """Return the click options for the Astra uploader.

        Covers the collection name, embedding dimension, namespace,
        requested indexing policy and upload batch size.
        """
        options = [
            click.Option(
                ["--collection-name"],
                required=False,
                type=str,
                help="The name of the Astra DB collection. "
                "Note that the collection name must only include letters, "
                "numbers, and underscores.",
            ),
            click.Option(
                ["--embedding-dimension"],
                required=True,
                default=384,
                type=int,
                help="The dimensionality of the embeddings",
            ),
            click.Option(
                ["--namespace"],
                required=False,
                default=None,
                type=str,
                help="The Astra DB connection namespace.",
            ),
            click.Option(
                ["--requested-indexing-policy"],
                required=False,
                default=None,
                type=Dict(),
                # Separator added after the first sentence so the rendered
                # help no longer reads "collection.example:".
                help="The indexing policy to use for the collection. "
                'example: \'{"deny": ["metadata"]}\'',
            ),
            click.Option(
                ["--batch-size"],
                default=20,
                type=int,
                help="Number of records per batch",
            ),
        ]
        return options
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# Destination command definition for the Astra connector. No upload stager
# config is declared, so only connection and uploader options are attached.
astra_dest_cmd = DestCmd(
    cmd_name=CONNECTOR_TYPE,
    uploader_config=AstraCliUploaderConfig,
    connection_config=AstraCliConnectionConfig,
)
|