unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.interfaces import (
|
|
7
|
+
CliConfig,
|
|
8
|
+
)
|
|
9
|
+
from unstructured_ingest.connector.qdrant import QdrantWriteConfig, SimpleQdrantConfig
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class QdrantCliConfig(SimpleQdrantConfig, CliConfig):
    """Connection-level CLI options for the ``qdrant`` destination command."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options used to configure the Qdrant connection.

        Returns:
            A list of ``click.Option`` objects, one per connection setting.
        """
        options = [
            click.Option(
                ["--collection-name"],
                required=True,
                type=str,
                help="The name of the Qdrant collection to use.",
            ),
            click.Option(
                ["--location"],
                type=str,
                help="The location of the Qdrant cluster.",
            ),
            click.Option(
                ["--url"],
                type=str,
                # Previously a verbatim copy of the --location help text, which
                # made the two options indistinguishable in --help output.
                help="The URL of the Qdrant cluster.",
            ),
            click.Option(
                ["--port"],
                type=int,
                default=6333,
                help="Port of the REST API interface. Default: 6333.",
            ),
            click.Option(
                ["--grpc-port"],
                type=int,
                default=6334,
                help="Port of the gRPC interface. Default: 6334.",
            ),
            click.Option(
                ["--prefer-grpc"],
                type=bool,
                is_flag=True,
                # Fixed typo: "gPRC" -> "gRPC".
                help="Whether to use gRPC interface whenever possible in methods. Default: False.",
            ),
            click.Option(
                ["--https"],
                type=bool,
                is_flag=True,
                help="Whether to use HTTPS(SSL) protocol. Default: False.",
            ),
            click.Option(
                ["--prefix"],
                type=str,
                help="Prefix to add the REST API endpoints.",
            ),
            click.Option(
                ["--timeout"],
                # NOTE(review): the help text mentions a 5.0 second default, but
                # the option only accepts integers — confirm whether float was intended.
                type=int,
                help="Timeout for operations. Default: 5.0 seconds for REST, unlimited for gRPC.",
            ),
            click.Option(
                ["--host"],
                type=str,
                help="Host name of the Qdrant service.",
            ),
            click.Option(
                ["--path"],
                type=str,
                help="Persistence path for QdrantLocal.",
            ),
            click.Option(
                ["--force-disable-check-same-thread"],
                type=bool,
                is_flag=True,
                help="Whether to force disable check same thread for QdrantLocal.",
            ),
            click.Option(
                ["--api-key"],
                type=str,
                help="API key for authentication in Qdrant Cloud. Default: None.",
                # The key may also be supplied through the environment.
                envvar="QDRANT_API_KEY",
                show_envvar=True,
            ),
        ]
        return options
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
class QdrantCliWriteConfig(QdrantWriteConfig, CliConfig):
    """Write-side CLI options for the ``qdrant`` destination command."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options that tune how points are uploaded."""
        batch_size = click.Option(
            ["--batch-size"],
            type=int,
            default=50,
            help="Number of points to upload per batch",
        )
        num_processes = click.Option(
            ["--num-processes"],
            type=int,
            default=2,
            help="Number of parallel processes with which to upload",
        )
        return [batch_size, num_processes]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def get_base_dest_cmd():
    """Build the ``qdrant`` destination command.

    Returns:
        A ``BaseDestCmd`` wired with the Qdrant connection options, the
        write options, and ``QdrantWriteConfig`` as the write config.
    """
    # Imported inside the function, as in the original, rather than at module level.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name="qdrant",
        cli_config=QdrantCliConfig,
        additional_cli_options=[QdrantCliWriteConfig],
        write_config=QdrantWriteConfig,
    )
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import (
|
|
8
|
+
CliConfig,
|
|
9
|
+
)
|
|
10
|
+
from unstructured_ingest.connector.reddit import SimpleRedditConfig
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class RedditCliConfig(SimpleRedditConfig, CliConfig):
    """CLI options for the ``reddit`` source command."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options used to configure the Reddit source.

        Returns:
            A list of ``click.Option`` objects covering credentials, the
            subreddit to read, an optional search query, and the post limit.
        """
        options = [
            click.Option(
                ["--client-id"],
                required=True,
                type=str,
                help="The client ID, see "
                "https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#prerequisites"  # noqa: E501
                " for more information.",
            ),
            click.Option(
                ["--client-secret"],
                required=True,
                type=str,
                help="The client secret, see "
                "https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#prerequisites"  # noqa: E501
                " for more information.",
            ),
            click.Option(
                ["--subreddit-name"],
                required=True,
                type=str,
                # Fixed: the help previously rendered the prefix as "r\"
                # (backslash); the subreddit prefix is "r/".
                help='The name of a subreddit, without the "r/", e.g. "machinelearning"',
            ),
            click.Option(
                ["--search-query"],
                default=None,
                type=str,
                help="If set, return posts using this query. Otherwise, use hot posts.",
            ),
            click.Option(
                ["--num-posts"],
                # NOTE(review): the option is required although the help text
                # reads "If set" — confirm which one is intended.
                required=True,
                type=click.IntRange(0),
                help="If set, limits the number of posts to pull in.",
            ),
            click.Option(
                ["--user-agent"],
                required=True,
                type=str,
                help="user agent request header to use when calling Reddit API",
            ),
        ]
        return options
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``reddit`` source command from ``RedditCliConfig``."""
    return BaseSrcCmd(cmd_name="reddit", cli_config=RedditCliConfig)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import (
|
|
8
|
+
CliConfig,
|
|
9
|
+
CliRecursiveConfig,
|
|
10
|
+
DelimitedString,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.connector.salesforce import SimpleSalesforceConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class SalesforceCliConfig(SimpleSalesforceConfig, CliConfig):
    """CLI options for the ``salesforce`` source command."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options used to configure the Salesforce source."""
        # Categories accepted by --categories; also listed in its help text.
        supported_categories = ["Account", "Case", "Campaign", "EmailMessage", "Lead"]
        categories_help = (
            "Comma-delimited salesforce categories to download. "
            f"Currently only {', '.join(supported_categories)}."
        )

        username = click.Option(
            ["--username"],
            type=str,
            required=True,
            help="Salesforce username usually looks like an email.",
        )
        consumer_key = click.Option(
            ["--consumer-key"],
            type=str,
            required=True,
            help="For the Salesforce JWT auth. Found in Consumer Details.",
        )
        private_key = click.Option(
            ["--private-key"],
            type=str,
            required=True,
            help="Path to the private key or its contents for the Salesforce JWT auth. "
            "Key file is usually named server.key.",
        )
        categories = click.Option(
            ["--categories"],
            type=DelimitedString(choices=supported_categories),
            default=None,
            required=True,
            help=categories_help,
        )
        return [username, consumer_key, private_key, categories]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``salesforce`` source command.

    The command takes its options from ``SalesforceCliConfig`` plus the
    options contributed by ``CliRecursiveConfig``.
    """
    return BaseSrcCmd(
        cmd_name="salesforce",
        cli_config=SalesforceCliConfig,
        additional_cli_options=[CliRecursiveConfig],
    )
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import (
|
|
8
|
+
CliConfig,
|
|
9
|
+
CliRecursiveConfig,
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.connector.sharepoint import SimpleSharepointConfig
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class SharepointCliConfig(SimpleSharepointConfig, CliConfig):
    """CLI options for the ``sharepoint`` source command."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options used to configure the Sharepoint source.

        Returns:
            A list of ``click.Option`` objects covering app credentials, the
            site url, the starting path, and the files-only switch.
        """
        # The long help texts are built with implicit string concatenation.
        # The previous backslash continuations inside a single literal pulled
        # the continuation lines' leading whitespace into the help text and
        # joined "...sharepoint.com." directly onto "This requires".
        site_help = (
            "Sharepoint site url. Process either base url e.g "
            "https://[tenant].sharepoint.com or relative sites "
            "https://[tenant].sharepoint.com/sites/<site_name>. "
            "To process all sites within the tenant pass a site url as "
            "https://[tenant]-admin.sharepoint.com. "
            "This requires the app to be registered at a tenant level"
        )
        path_help = (
            "Path from which to start parsing files. If the connector is to "
            "process all sites within the tenant this filter will be applied to "
            "all sites document libraries. Default 'Shared Documents'"
        )
        options = [
            click.Option(
                ["--client-id"],
                default=None,
                type=str,
                help="Sharepoint app client ID",
            ),
            click.Option(
                ["--client-cred"],
                default=None,
                type=str,
                help="Sharepoint app secret",
            ),
            click.Option(
                ["--site"],
                default=None,
                type=str,
                help=site_help,
            ),
            click.Option(
                ["--path"],
                default="Shared Documents",
                type=str,
                help=path_help,
            ),
            click.Option(
                ["--files-only"],
                is_flag=True,
                default=False,
                help="Process only files.",
            ),
        ]
        return options
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``sharepoint`` source command.

    The command takes its options from ``SharepointCliConfig`` plus the
    options contributed by ``CliRecursiveConfig``.
    """
    return BaseSrcCmd(
        cmd_name="sharepoint",
        cli_config=SharepointCliConfig,
        additional_cli_options=[CliRecursiveConfig],
    )
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import (
|
|
8
|
+
CliConfig,
|
|
9
|
+
DelimitedString,
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.connector.slack import SimpleSlackConfig
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class SlackCliConfig(SimpleSlackConfig, CliConfig):
    """CLI options for the ``slack`` source command."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options used to configure the Slack source."""
        # Both date options advertise the same accepted formats.
        date_formats = (
            "YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or "
            "YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SStz"
        )

        token = click.Option(
            ["--token"],
            type=str,
            required=True,
            help="Bot token used to access Slack API, must have channels:history "
            "scope for the bot user",
        )
        channels = click.Option(
            ["--channels"],
            type=DelimitedString(),
            required=True,
            help="Comma-delimited list of Slack channel IDs to pull messages from, "
            "can be a public or private channel",
        )
        start_date = click.Option(
            ["--start-date"],
            type=str,
            default=None,
            help=f"Start date/time in formats {date_formats}",
        )
        end_date = click.Option(
            ["--end-date"],
            type=str,
            default=None,
            help=f"End date/time in formats {date_formats}",
        )
        return [token, channels, start_date, end_date]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Create the source command object for the ``slack`` subcommand.

    Returns:
        A ``BaseSrcCmd`` named ``"slack"`` whose options come from
        ``SlackCliConfig``.
    """
    return BaseSrcCmd(cmd_name="slack", cli_config=SlackCliConfig)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.interfaces import CliConfig
|
|
7
|
+
from unstructured_ingest.connector.sql import SimpleSqlConfig
|
|
8
|
+
from unstructured_ingest.interfaces import WriteConfig
|
|
9
|
+
|
|
10
|
+
# Database backends accepted as values of the --db-type CLI option.
SQL_DRIVERS = {"postgresql", "sqlite"}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class SqlCliConfig(SimpleSqlConfig, CliConfig):
    """SQL connector configuration together with its click options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Build the click options for the SQL command.

        Returns:
            The ``--db-type``, ``--username``, ``--password``, ``--host``,
            ``--port`` and ``--database`` options, in that order.
        """
        options = [
            click.Option(
                ["--db-type"],
                required=True,
                # SQL_DRIVERS is a set; sorting it gives click the ordered
                # sequence it documents and keeps the choices shown in help
                # and error output in a stable order between runs.
                type=click.Choice(sorted(SQL_DRIVERS)),
                help="Type of the database backend",
            ),
            click.Option(
                ["--username"],
                default=None,
                type=str,
                help="DB username",
            ),
            click.Option(
                ["--password"],
                default=None,
                type=str,
                help="DB password",
            ),
            click.Option(
                ["--host"],
                default=None,
                type=str,
                help="DB host",
            ),
            click.Option(
                ["--port"],
                default=None,
                type=int,
                help="DB host connection port",
            ),
            click.Option(
                ["--database"],
                default=None,
                type=str,
                help="Database name. For sqlite databases, this is the path to the .db file.",
            ),
        ]
        return options
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def get_base_dest_cmd():
    """Create the destination command object for the ``sql`` subcommand.

    Returns:
        A ``BaseDestCmd`` named ``"sql"`` using ``SqlCliConfig`` for its
        options and ``WriteConfig`` as the write configuration.
    """
    # Kept as a call-time import, matching the original placement
    # (presumably to avoid an import cycle -- TODO confirm).
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name="sql",
        cli_config=SqlCliConfig,
        write_config=WriteConfig,
    )
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.interfaces import CliConfig
|
|
7
|
+
from unstructured_ingest.connector.vectara import SimpleVectaraConfig, WriteConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class VectaraCliWriteConfig(SimpleVectaraConfig, CliConfig):
    """Vectara connector configuration together with its click options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Build the click options for the Vectara command.

        The three credential options are required and may also be supplied
        through the environment variables named in their ``envvar`` argument.

        Returns:
            The ``--customer-id``, ``--oauth-client-id``, ``--oauth-secret``,
            ``--corpus-name`` and ``--token-url`` options, in that order.
        """
        customer_id_opt = click.Option(
            ["--customer-id"],
            required=True,
            type=str,
            help="The Vectara customer-id.",
            envvar="VECTARA_CUSTOMER_ID",
            show_envvar=True,
        )
        oauth_client_id_opt = click.Option(
            ["--oauth-client-id"],
            required=True,
            type=str,
            help="Vectara OAuth2 client ID.",
            envvar="VECTARA_OAUTH_CLIENT_ID",
            show_envvar=True,
        )
        oauth_secret_opt = click.Option(
            ["--oauth-secret"],
            required=True,
            type=str,
            help="Vectara OAuth2 secret.",
            envvar="VECTARA_OAUTH_SECRET",
            show_envvar=True,
        )
        corpus_name_opt = click.Option(
            ["--corpus-name"],
            required=False,
            type=str,
            default=None,
            help="The Vectara corpus-name.",
        )
        # The default URL carries a "{}" placeholder for the customer id,
        # as the help text below spells out.
        token_url_opt = click.Option(
            ["--token-url"],
            required=False,
            default="https://vectara-prod-{}.auth.us-west-2.amazoncognito.com/oauth2/token",
            type=str,
            help="The Vectara endpoint for token refresh. Needs curly brackets for customer_id",
        )
        return [
            customer_id_opt,
            oauth_client_id_opt,
            oauth_secret_opt,
            corpus_name_opt,
            token_url_opt,
        ]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_base_dest_cmd():
    """Create the destination command object for the ``vectara`` subcommand.

    Returns:
        A ``BaseDestCmd`` named ``"vectara"`` using ``VectaraCliWriteConfig``
        for its options, no additional option groups, and ``WriteConfig`` as
        the write configuration.
    """
    # Kept as a call-time import, matching the original placement
    # (presumably to avoid an import cycle -- TODO confirm).
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name="vectara",
        cli_config=VectaraCliWriteConfig,
        additional_cli_options=[],
        write_config=WriteConfig,
    )
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.interfaces import CliConfig, DelimitedString
|
|
7
|
+
from unstructured_ingest.connector.weaviate import SimpleWeaviateConfig, WeaviateWriteConfig
|
|
8
|
+
|
|
9
|
+
# Name passed as cmd_name when the Weaviate destination command is built.
CMD_NAME = "weaviate"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class WeaviateCliConfig(SimpleWeaviateConfig, CliConfig):
    """Weaviate connector configuration together with its click options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Build the connection and authentication options for Weaviate.

        Returns:
            The ``--host-url``, ``--class-name``, ``--access-token``,
            ``--refresh-token``, ``--api-key``, ``--client-secret``,
            ``--scope``, ``--username``, ``--password`` and ``--anonymous``
            options, in that order.
        """
        host_url_opt = click.Option(
            ["--host-url"],
            required=True,
            help="Weaviate instance url",
        )
        class_name_opt = click.Option(
            ["--class-name"],
            default=None,
            type=str,
            help="Name of the class to push the records into, e.g: Pdf-elements",
        )
        access_token_opt = click.Option(
            ["--access-token"],
            default=None,
            type=str,
            help="Used to create the bearer token.",
        )
        refresh_token_opt = click.Option(
            ["--refresh-token"],
            default=None,
            type=str,
            help="Will tie this value to the bearer token. If not provided, "
            "the authentication will expire once the lifetime of the access token is up.",
        )
        # The next five options are declared without help text.
        api_key_opt = click.Option(["--api-key"], default=None, type=str)
        client_secret_opt = click.Option(["--client-secret"], default=None, type=str)
        scope_opt = click.Option(["--scope"], default=None, type=DelimitedString())
        username_opt = click.Option(["--username"], default=None, type=str)
        password_opt = click.Option(["--password"], default=None, type=str)
        anonymous_opt = click.Option(
            ["--anonymous"],
            is_flag=True,
            default=False,
            type=bool,
            help="if set, all auth values will be ignored",
        )
        return [
            host_url_opt,
            class_name_opt,
            access_token_opt,
            refresh_token_opt,
            api_key_opt,
            client_secret_opt,
            scope_opt,
            username_opt,
            password_opt,
            anonymous_opt,
        ]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class WeaviateCliWriteConfig(WeaviateWriteConfig, CliConfig):
    """Weaviate write configuration together with its click options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Build the write-related click options for Weaviate.

        Returns:
            A single-element list holding the ``--batch-size`` option
            (integer, default 100).
        """
        batch_size_opt = click.Option(
            ["--batch-size"],
            default=100,
            type=int,
            help="Number of records per batch",
        )
        return [batch_size_opt]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def get_base_dest_cmd():
    """Create the destination command object for the Weaviate subcommand.

    Returns:
        A ``BaseDestCmd`` named by ``CMD_NAME`` using ``WeaviateCliConfig``
        for its options, ``WeaviateCliWriteConfig`` as an additional option
        group, and ``WeaviateWriteConfig`` as the write configuration.
    """
    # Kept as a call-time import, matching the original placement
    # (presumably to avoid an import cycle -- TODO confirm).
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name=CMD_NAME,
        cli_config=WeaviateCliConfig,
        additional_cli_options=[WeaviateCliWriteConfig],
        write_config=WeaviateWriteConfig,
    )
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import (
|
|
8
|
+
CliConfig,
|
|
9
|
+
)
|
|
10
|
+
from unstructured_ingest.connector.wikipedia import SimpleWikipediaConfig
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class WikipediaCliConfig(SimpleWikipediaConfig, CliConfig):
    """Wikipedia connector configuration together with its click options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Build the click options for the Wikipedia command.

        Returns:
            The ``--page-title`` and ``--auto-suggest`` options, in that order.
        """
        page_title_opt = click.Option(
            ["--page-title"],
            required=True,
            type=str,
            help='Title of a Wikipedia page, e.g. "Open source software".',
        )
        # NOTE(review): this is a boolean flag whose default is already True;
        # what passing the flag does to the value may depend on the installed
        # click version -- confirm users can actually turn auto-suggest off.
        auto_suggest_opt = click.Option(
            ["--auto-suggest"],
            default=True,
            is_flag=True,
            help="Whether to automatically suggest a page if the exact page was not found."
            " Set to False if the wrong Wikipedia page is fetched.",
        )
        return [page_title_opt, auto_suggest_opt]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Create the source command object for the ``wikipedia`` subcommand.

    Returns:
        A ``BaseSrcCmd`` named ``"wikipedia"`` whose options come from
        ``WikipediaCliConfig``.
    """
    return BaseSrcCmd(cmd_name="wikipedia", cli_config=WikipediaCliConfig)
|