unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version string of the unstructured_ingest package.
__version__ = "0.0.0"  # pragma: no cover
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
|
|
3
|
+
import click
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.cli.cmds import base_dest_cmd_fns, base_src_cmd_fns
|
|
6
|
+
|
|
7
|
+
# One click group per source command: each factory in ``base_src_cmd_fns`` is
# called at import time and asked for its click group.
src: t.List[click.Group] = [
    make_cmd().get_src_cmd() for make_cmd in base_src_cmd_fns
]

# One click command per destination command, built the same way from
# ``base_dest_cmd_fns``.
dest: t.List[click.Command] = [
    make_cmd().get_dest_cmd() for make_cmd in base_dest_cmd_fns
]

__all__ = ["src", "dest"]
|
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from abc import ABC
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.cli.interfaces import CliConfig
|
|
6
|
+
from unstructured_ingest.interfaces import BaseConfig
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class BaseCmd(ABC):
    """Settings shared by the source and destination CLI command builders."""

    # Name of the command; a hyphenated name is allowed (see ``cmd_name_key``).
    cmd_name: str
    # Main config class for the command, if it has one.
    cli_config: t.Optional[t.Type[BaseConfig]] = None
    # Further CLI config classes; every instance gets its own empty list by default.
    additional_cli_options: t.List[t.Type[CliConfig]] = field(default_factory=list)
    # Extra config classes keyed by name; every instance gets its own empty dict by default.
    addition_configs: t.Dict[str, t.Type[BaseConfig]] = field(default_factory=dict)
    # Whether the command is fsspec-based.
    is_fsspec: bool = False

    @property
    def cmd_name_key(self) -> str:
        """Return ``cmd_name`` with every hyphen swapped for an underscore."""
        return "_".join(self.cmd_name.split("-"))
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import typing as t
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.cli.base.cmd import BaseCmd
|
|
8
|
+
from unstructured_ingest.cli.cmd_factory import get_src_cmd
|
|
9
|
+
from unstructured_ingest.cli.common import (
|
|
10
|
+
log_options,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.cli.interfaces import BaseConfig, CliFilesStorageConfig
|
|
13
|
+
from unstructured_ingest.cli.utils import (
|
|
14
|
+
add_options,
|
|
15
|
+
conform_click_options,
|
|
16
|
+
extract_config,
|
|
17
|
+
extract_configs,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.logger import ingest_log_streaming_init, logger
|
|
20
|
+
from unstructured_ingest.runner.writers import writer_map
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class BaseDestCmd(BaseCmd):
    """Destination-side CLI command.

    Builds the click command for a destination and, when invoked, attaches the
    matching writer to the runner of the parent source command.
    """

    # Optional write-config class; handed to ``extract_configs`` under the
    # ``write_config`` key when set.
    write_config: t.Optional[t.Type[BaseConfig]] = None

    def get_dest_runner(self, source_cmd: str, options: dict, parent_options: dict):
        """Build the parent source runner and attach this destination's writer to it.

        Args:
            source_cmd: Key of the parent source command, as accepted by ``get_src_cmd``.
            options: Options given to this destination command.
            parent_options: Options given to the parent source command.

        Returns:
            The source runner with ``writer`` and ``writer_kwargs`` set.

        Raises:
            KeyError: If ``cmd_name_key`` has no entry in ``writer_map``.
        """
        src_cmd_fn = get_src_cmd(cmd_name=source_cmd)
        src_cmd = src_cmd_fn()
        runner = src_cmd.get_source_runner(options=parent_options)
        # Work on a copy: building a runner must not leave extra keys behind in
        # this command's own ``addition_configs``.
        addition_configs = dict(self.addition_configs)
        if "connector_config" not in addition_configs:
            addition_configs["connector_config"] = self.cli_config
        if self.write_config:
            addition_configs["write_config"] = self.write_config
        configs = extract_configs(
            options,
            validate=[self.cli_config] if self.cli_config else None,
            extras=addition_configs,
            add_defaults=False,
        )
        writer_cls = writer_map[self.cmd_name_key]
        writer = writer_cls(**configs)  # type: ignore
        runner.writer = writer
        runner.writer_kwargs = options
        return runner

    def check_dest_options(self, options: dict):
        """Pass the destination options through ``extract_config`` for ``cli_config``.

        The result is discarded; presumably the call is made so that invalid
        options raise early -- confirm against ``extract_config``.
        """
        extract_config(flat_data=options, config=self.cli_config)

    def dest(self, ctx: click.Context, **options):
        """Click callback for the destination command.

        Sets up log streaming (DEBUG when the parent's ``verbose`` option is
        truthy, INFO otherwise), logs both option sets, checks the destination
        options, then builds the runner and runs it with the parent's options.

        Raises:
            click.ClickException: If the context has no parent or the parent has
                no info name, or wrapping any exception raised while checking
                options or building/running the runner.
        """
        if not ctx.parent:
            raise click.ClickException("destination command called without a parent")
        if not ctx.parent.info_name:
            raise click.ClickException("parent command missing info name")
        source_cmd = ctx.parent.info_name.replace("-", "_")
        # ctx.parent is known to be set here, so its params can be read directly.
        parent_options: dict = ctx.parent.params
        conform_click_options(options)
        verbose = parent_options.get("verbose", False)
        ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
        log_options(parent_options, verbose=verbose)
        log_options(options, verbose=verbose)
        try:
            self.check_dest_options(options=options)
            runner = self.get_dest_runner(
                source_cmd=source_cmd,
                options=options,
                parent_options=parent_options,
            )
            runner.run(**parent_options)
        except Exception as e:
            # CLI boundary: log the full traceback, then surface a click error.
            logger.error(e, exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_dest_cmd(self) -> click.Command:
        """Create the click command for this destination without using decorators.

        Returns:
            The command named ``cmd_name`` with the options of ``cli_config``,
            ``additional_cli_options`` and, for fsspec commands,
            ``CliFilesStorageConfig`` added.
        """
        # Dynamically create the command without the use of click decorators
        fn = self.dest
        fn = click.pass_context(fn)
        # NOTE(review): click.command() is used here rather than click.group();
        # the Group annotation and ``invoke_without_command`` look carried over
        # from the source command -- confirm whether they are still needed.
        cmd: click.Group = click.command(fn)
        cmd.name = self.cmd_name
        cmd.invoke_without_command = True
        # A fresh list, so the += below does not touch ``additional_cli_options``.
        options = [self.cli_config] if self.cli_config else []
        options += self.additional_cli_options
        if self.is_fsspec and CliFilesStorageConfig not in options:
            options.append(CliFilesStorageConfig)
        add_options(cmd, extras=options, is_src=False)
        return cmd
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.cmd import BaseCmd
|
|
7
|
+
from unstructured_ingest.cli.common import (
|
|
8
|
+
log_options,
|
|
9
|
+
)
|
|
10
|
+
from unstructured_ingest.cli.interfaces import CliFilesStorageConfig
|
|
11
|
+
from unstructured_ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
|
|
12
|
+
from unstructured_ingest.logger import ingest_log_streaming_init, logger
|
|
13
|
+
from unstructured_ingest.runner import runner_map
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class BaseSrcCmd(BaseCmd):
    """Source-side CLI command.

    Builds the click group for a source and, when invoked without a
    sub-command, creates the matching runner and runs it.
    """

    def get_source_runner(self, options: dict):
        """Instantiate the runner registered for this source command.

        Args:
            options: Options given to the source command.

        Returns:
            The ``runner_map`` entry for ``cmd_name_key``, constructed from the
            configs that ``extract_configs`` produces.

        Raises:
            KeyError: If ``cmd_name_key`` has no entry in ``runner_map``.
        """
        # Work on a copy: building a runner must not leave extra keys behind in
        # this command's own ``addition_configs``.
        addition_configs = dict(self.addition_configs)
        if "connector_config" not in addition_configs:
            addition_configs["connector_config"] = self.cli_config
        configs = extract_configs(
            options,
            validate=[self.cli_config] if self.cli_config else None,
            extras=addition_configs,
        )
        runner = runner_map[self.cmd_name_key]
        return runner(**configs)  # type: ignore

    def src(self, ctx: click.Context, **options):
        """Click callback for the source command.

        Does nothing when a sub-command was invoked. Otherwise sets up log
        streaming (DEBUG when ``verbose`` is truthy, INFO otherwise), logs the
        options, then builds the runner and runs it with those options.

        Raises:
            click.ClickException: Wrapping any exception raised while building
                or running the runner.
        """
        if ctx.invoked_subcommand:
            return

        conform_click_options(options)
        verbose = options.get("verbose", False)
        ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
        log_options(options, verbose=verbose)
        try:
            runner = self.get_source_runner(options=options)
            runner.run(**options)
        except Exception as e:
            # CLI boundary: log the full traceback, then surface a click error.
            logger.error(e, exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_src_cmd(self) -> click.Group:
        """Create the click group for this source without using decorators.

        Returns:
            The group named ``cmd_name``, with ``invoke_without_command`` set to
            True and the options of ``cli_config``, ``additional_cli_options``
            and, for fsspec commands, ``CliFilesStorageConfig`` added.
        """
        # Dynamically create the command without the use of click decorators
        fn = self.src
        fn = click.pass_context(fn)
        cmd: click.Group = click.group(fn, cls=Group)
        cmd.name = self.cmd_name
        cmd.invoke_without_command = True
        # A fresh list, so the += below does not touch ``additional_cli_options``.
        extra_options = [self.cli_config] if self.cli_config else []
        extra_options += self.additional_cli_options
        if self.is_fsspec and CliFilesStorageConfig not in extra_options:
            extra_options.append(CliFilesStorageConfig)
        add_options(cmd, extras=extra_options)
        return cmd
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import click
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.cli import dest, src
|
|
4
|
+
from unstructured_ingest.v2.cli.cmds import dest as dest_v2
|
|
5
|
+
from unstructured_ingest.v2.cli.cmds import src as src_v2
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@click.group()
def ingest():
    # Root group with no behavior of its own; get_cmd() attaches the source
    # groups to it. Kept without a docstring on purpose, since click would
    # otherwise show that text as the command's help.
    return None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_cmd() -> click.Command:
    """Assemble the root ``ingest`` command with all source/destination pairings.

    Commands are keyed by name with the v2 lists applied after the v1 lists, so
    a v2 command replaces a v1 command of the same name. Every destination
    command is then added to every source group, and every source group is
    added to the root group.
    """
    sources = {command.name: command for command in [*src, *src_v2]}
    destinations = {command.name: command for command in [*dest, *dest_v2]}
    for source in sources.values():
        # Nest every destination under this source before registering the source.
        for destination in destinations.values():
            source.add_command(destination)
        ingest.add_command(source)
    return ingest
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
4
|
+
from unstructured_ingest.cli.cmds import base_src_cmd_fns
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_src_cmd_map() -> t.Dict[str, t.Callable[[], BaseSrcCmd]]:
    """Map each source command's ``cmd_name_key`` to the factory that builds it.

    Every factory in ``base_src_cmd_fns`` is called once per invocation, only
    to read the key of the command it produces.
    """
    factories_by_key: t.Dict[str, t.Callable[[], BaseSrcCmd]] = {}
    for factory in base_src_cmd_fns:
        factories_by_key[factory().cmd_name_key] = factory
    return factories_by_key
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_src_cmd(cmd_name: str) -> t.Callable[[], BaseSrcCmd]:
    """Look up the factory for the source command registered under ``cmd_name``.

    Args:
        cmd_name: Source command key, i.e. a ``cmd_name_key`` value.

    Returns:
        The factory that builds the matching source command.

    Raises:
        KeyError: If no source command uses that key; the message lists the
            keys that are available.
    """
    cmd_map = get_src_cmd_map()
    try:
        return cmd_map[cmd_name]
    except KeyError:
        # A bare KeyError would only carry the missing key, which makes for an
        # unhelpful message once callers turn it into CLI output.
        known = ", ".join(sorted(cmd_map))
        raise KeyError(
            f"unknown source command {cmd_name!r}; expected one of: {known}"
        ) from None
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import collections
|
|
4
|
+
import typing as t
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_base_src_cmd
|
|
8
|
+
|
|
9
|
+
from .airtable import get_base_src_cmd as airtable_base_src_cmd
|
|
10
|
+
from .astra import get_base_dest_cmd as astra_base_dest_cmd
|
|
11
|
+
from .astra import get_base_src_cmd as astra_base_src_cmd
|
|
12
|
+
from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd
|
|
13
|
+
from .biomed import get_base_src_cmd as biomed_base_src_cmd
|
|
14
|
+
from .chroma import get_base_dest_cmd as chroma_base_dest_cmd
|
|
15
|
+
from .clarifai import get_base_dest_cmd as clarifai_base_dest_cmd
|
|
16
|
+
from .confluence import get_base_src_cmd as confluence_base_src_cmd
|
|
17
|
+
from .databricks_volumes import get_base_dest_cmd as databricks_volumes_dest_cmd
|
|
18
|
+
from .delta_table import get_base_dest_cmd as delta_table_dest_cmd
|
|
19
|
+
from .delta_table import get_base_src_cmd as delta_table_base_src_cmd
|
|
20
|
+
from .discord import get_base_src_cmd as discord_base_src_cmd
|
|
21
|
+
from .elasticsearch import get_base_dest_cmd as elasticsearch_base_dest_cmd
|
|
22
|
+
from .elasticsearch import get_base_src_cmd as elasticsearch_base_src_cmd
|
|
23
|
+
from .fsspec.azure import get_base_dest_cmd as azure_base_dest_cmd
|
|
24
|
+
from .fsspec.azure import get_base_src_cmd as azure_base_src_cmd
|
|
25
|
+
from .fsspec.box import get_base_dest_cmd as box_base_dest_cmd
|
|
26
|
+
from .fsspec.box import get_base_src_cmd as box_base_src_cmd
|
|
27
|
+
from .fsspec.dropbox import get_base_dest_cmd as dropbox_base_dest_cmd
|
|
28
|
+
from .fsspec.dropbox import get_base_src_cmd as dropbox_base_src_cmd
|
|
29
|
+
from .fsspec.fsspec import get_base_dest_cmd as fsspec_base_dest_cmd
|
|
30
|
+
from .fsspec.fsspec import get_base_src_cmd as fsspec_base_src_cmd
|
|
31
|
+
from .fsspec.gcs import get_base_dest_cmd as gcs_base_dest_cmd
|
|
32
|
+
from .fsspec.gcs import get_base_src_cmd as gcs_base_src_cmd
|
|
33
|
+
from .fsspec.s3 import get_base_dest_cmd as s3_base_dest_cmd
|
|
34
|
+
from .fsspec.s3 import get_base_src_cmd as s3_base_src_cmd
|
|
35
|
+
from .github import get_base_src_cmd as github_base_src_cmd
|
|
36
|
+
from .gitlab import get_base_src_cmd as gitlab_base_src_cmd
|
|
37
|
+
from .google_drive import get_base_src_cmd as google_drive_base_src_cmd
|
|
38
|
+
from .hubspot import get_base_src_cmd as hubspot_base_src_cmd
|
|
39
|
+
from .jira import get_base_src_cmd as jira_base_src_cmd
|
|
40
|
+
from .kafka import get_base_dest_cmd as kafka_base_dest_cmd
|
|
41
|
+
from .kafka import get_base_src_cmd as kafka_base_src_cmd
|
|
42
|
+
from .local import get_base_src_cmd as local_base_src_cmd
|
|
43
|
+
from .mongodb import get_base_dest_cmd as mongo_base_dest_cmd
|
|
44
|
+
from .mongodb import get_base_src_cmd as mongodb_base_src_cmd
|
|
45
|
+
from .notion import get_base_src_cmd as notion_base_src_cmd
|
|
46
|
+
from .onedrive import get_base_src_cmd as onedrive_base_src_cmd
|
|
47
|
+
from .opensearch import get_base_dest_cmd as opensearch_base_dest_cmd
|
|
48
|
+
from .opensearch import get_base_src_cmd as opensearch_base_src_cmd
|
|
49
|
+
from .outlook import get_base_src_cmd as outlook_base_src_cmd
|
|
50
|
+
from .pinecone import get_base_dest_cmd as pinecone_base_dest_cmd
|
|
51
|
+
from .qdrant import get_base_dest_cmd as qdrant_base_dest_cmd
|
|
52
|
+
from .reddit import get_base_src_cmd as reddit_base_src_cmd
|
|
53
|
+
from .salesforce import get_base_src_cmd as salesforce_base_src_cmd
|
|
54
|
+
from .sharepoint import get_base_src_cmd as sharepoint_base_src_cmd
|
|
55
|
+
from .slack import get_base_src_cmd as slack_base_src_cmd
|
|
56
|
+
from .sql import get_base_dest_cmd as sql_base_dest_cmd
|
|
57
|
+
from .vectara import get_base_dest_cmd as vectara_base_dest_cmd
|
|
58
|
+
from .weaviate import get_base_dest_cmd as weaviate_dest_cmd
|
|
59
|
+
from .wikipedia import get_base_src_cmd as wikipedia_base_src_cmd
|
|
60
|
+
|
|
61
|
+
if t.TYPE_CHECKING:
|
|
62
|
+
from unstructured_ingest.cli.base.dest import BaseDestCmd
|
|
63
|
+
|
|
64
|
+
# Factories for the source connectors; each one is called with no arguments
# and is annotated as returning a BaseSrcCmd.
base_src_cmd_fns: t.List[t.Callable[[], BaseSrcCmd]] = [
    airtable_base_src_cmd,
    astra_base_src_cmd,
    azure_base_src_cmd,
    biomed_base_src_cmd,
    box_base_src_cmd,
    confluence_base_src_cmd,
    delta_table_base_src_cmd,
    discord_base_src_cmd,
    dropbox_base_src_cmd,
    elasticsearch_base_src_cmd,
    fsspec_base_src_cmd,
    gcs_base_src_cmd,
    github_base_src_cmd,
    gitlab_base_src_cmd,
    google_drive_base_src_cmd,
    hubspot_base_src_cmd,
    jira_base_src_cmd,
    kafka_base_src_cmd,
    local_base_src_cmd,
    mongodb_base_src_cmd,
    notion_base_src_cmd,
    onedrive_base_src_cmd,
    opensearch_base_src_cmd,
    outlook_base_src_cmd,
    reddit_base_src_cmd,
    salesforce_base_src_cmd,
    sftp_base_src_cmd,
    sharepoint_base_src_cmd,
    slack_base_src_cmd,
    s3_base_src_cmd,
    wikipedia_base_src_cmd,
]

# Fail at import time if two source commands report the same cmd_name.
# Every factory is invoked once here to read that name.
src_cmd_names = [factory().cmd_name for factory in base_src_cmd_fns]
src_duplicates = [
    name for name, seen in collections.Counter(src_cmd_names).items() if seen > 1
]
if src_duplicates:
    raise ValueError(
        "multiple base src commands defined with the same names: "
        + ", ".join(src_duplicates)
    )
|
|
107
|
+
|
|
108
|
+
# Factories for the destination connectors; each one is called with no
# arguments and is annotated as returning a BaseDestCmd.
base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
    astra_base_dest_cmd,
    azure_base_dest_cmd,
    box_base_dest_cmd,
    chroma_base_dest_cmd,
    clarifai_base_dest_cmd,
    databricks_volumes_dest_cmd,
    dropbox_base_dest_cmd,
    elasticsearch_base_dest_cmd,
    fsspec_base_dest_cmd,
    gcs_base_dest_cmd,
    kafka_base_dest_cmd,
    s3_base_dest_cmd,
    azure_cognitive_search_base_dest_cmd,
    delta_table_dest_cmd,
    sql_base_dest_cmd,
    weaviate_dest_cmd,
    mongo_base_dest_cmd,
    pinecone_base_dest_cmd,
    qdrant_base_dest_cmd,
    opensearch_base_dest_cmd,
    vectara_base_dest_cmd,
]

# Fail at import time if two destination commands report the same cmd_name.
# Every factory is invoked once here to read that name.
dest_cmd_names = [factory().cmd_name for factory in base_dest_cmd_fns]
dest_duplicates = [
    name for name, seen in collections.Counter(dest_cmd_names).items() if seen > 1
]
if dest_duplicates:
    raise ValueError(
        "multiple base dest commands defined with the same names: "
        + ", ".join(dest_duplicates)
    )

# Public names of this package: the two factory registries.
__all__ = ["base_src_cmd_fns", "base_dest_cmd_fns"]
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import (
|
|
8
|
+
CliConfig,
|
|
9
|
+
)
|
|
10
|
+
from unstructured_ingest.connector.airtable import SimpleAirtableConfig
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class AirtableCliConfig(SimpleAirtableConfig, CliConfig):
    """Airtable configuration that also declares its own command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for the Airtable command.

        Two options are declared, both defaulting to ``None``:
        ``--personal-access-token`` and ``--list-of-paths``.
        """
        token_option = click.Option(
            ["--personal-access-token"],
            default=None,
            help="Personal access token to authenticate into Airtable. Check: "
            "https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens "
            "for more info",
        )
        paths_option = click.Option(
            ["--list-of-paths"],
            default=None,
            help="""
        A list of paths that specify the locations to ingest data from within Airtable.

        If this argument is not set, the connector ingests all tables within each and every base.
        --list-of-paths: path1 path2 path3 ….
        path: base_id/table_id(optional)/view_id(optional)/

        To obtain (base, table, view) ids in bulk, check:
        https://airtable.com/developers/web/api/list-bases (base ids)
        https://airtable.com/developers/web/api/get-base-schema (table and view ids)
        https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids)

        To obtain specific ids from Airtable UI, go to your workspace, and copy any
        relevant id from the URL structure:
        https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM
        appAbcDeF1ghijKlm -> base_id
        tblABcdEfG1HIJkLm -> table_id
        viwABCDEfg6hijKLM -> view_id

        You can also check: https://support.airtable.com/docs/finding-airtable-ids

        Here is an example for one --list-of-paths:
        base1/ → gets the entirety of all tables inside base1
        base1/table1 → gets all rows and columns within table1 in base1
        base1/table1/view1 → gets the rows and columns that are
        visible in view1 for the table1 in base1

        Examples to invalid airtable_paths:
        table1 → has to mention base to be valid
        base1/view1 → has to mention table to be valid
        """,
        )
        return [token_option, paths_option]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Create the ``airtable`` source command, configured by ``AirtableCliConfig``."""
    return BaseSrcCmd(cmd_name="airtable", cli_config=AirtableCliConfig)
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.interfaces import CliConfig, Dict
|
|
7
|
+
from unstructured_ingest.connector.astra import AstraWriteConfig, SimpleAstraConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class AstraCliConfig(SimpleAstraConfig, CliConfig):
    """Astra DB connection configuration that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options used to connect to Astra DB.

        ``--token`` and ``--api-endpoint`` are required and may be supplied
        through the environment variables named in their definitions;
        ``--collection-name`` and ``--namespace`` are optional.
        """
        token = click.Option(
            ["--token"],
            required=True,
            type=str,
            help="Astra DB Token with access to the database.",
            envvar="ASTRA_DB_APPLICATION_TOKEN",
            show_envvar=True,
        )
        api_endpoint = click.Option(
            ["--api-endpoint"],
            required=True,
            type=str,
            help="The API endpoint for the Astra DB.",
            envvar="ASTRA_DB_API_ENDPOINT",
            show_envvar=True,
        )
        collection_name = click.Option(
            ["--collection-name"],
            required=False,
            type=str,
            help="The name of the Astra DB collection. "
            "Note that the collection name must only include letters, "
            "numbers, and underscores.",
        )
        namespace = click.Option(
            ["--namespace"],
            required=False,
            default=None,
            type=str,
            help="The Astra DB connection namespace.",
        )
        return [token, api_endpoint, collection_name, namespace]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class AstraCliWriteConfig(AstraWriteConfig, CliConfig):
    """Astra DB write configuration that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options that control writes to Astra DB.

        Returns:
            Options for ``--embedding-dimension`` (int, default 384),
            ``--requested-indexing-policy`` (parsed with the project's
            ``Dict`` parameter type, default ``None``) and ``--batch-size``
            (int, default 20).
        """
        options = [
            click.Option(
                ["--embedding-dimension"],
                required=True,
                default=384,
                type=int,
                help="The dimensionality of the embeddings",
            ),
            click.Option(
                ["--requested-indexing-policy"],
                required=False,
                default=None,
                type=Dict(),
                # Adjacent literals are concatenated as-is, so the first one
                # ends with a space to keep "collection." and "example:" apart.
                help="The indexing policy to use for the collection. "
                'example: \'{"deny": ["metadata"]}\' ',
            ),
            click.Option(
                ["--batch-size"],
                default=20,
                type=int,
                help="Number of records per batch",
            ),
        ]
        return options
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def get_base_src_cmd():
    """Create the ``astra`` source command, configured by ``AstraCliConfig``."""
    # The import stays local to the function, as in the original.
    from unstructured_ingest.cli.base.src import BaseSrcCmd

    return BaseSrcCmd(cmd_name="astra", cli_config=AstraCliConfig)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def get_base_dest_cmd():
    """Create the ``astra`` destination command.

    The command takes its connection options from ``AstraCliConfig``, its
    extra CLI options from ``AstraCliWriteConfig``, and uses
    ``AstraWriteConfig`` as its write configuration.
    """
    # The import stays local to the function, as in the original.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name="astra",
        cli_config=AstraCliConfig,
        additional_cli_options=[AstraCliWriteConfig],
        write_config=AstraWriteConfig,
    )
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.interfaces import (
|
|
7
|
+
CliConfig,
|
|
8
|
+
)
|
|
9
|
+
from unstructured_ingest.connector.azure_cognitive_search import (
|
|
10
|
+
AzureCognitiveSearchWriteConfig,
|
|
11
|
+
SimpleAzureCognitiveSearchStorageConfig,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class AzureCognitiveSearchCliConfig(SimpleAzureCognitiveSearchStorageConfig, CliConfig):
    """Azure Cognitive Search connection configuration that also declares its CLI options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options used to connect to Azure Cognitive Search.

        Returns:
            Options for ``--key`` and ``--endpoint``. Both are required
            strings and may be supplied through the environment variables
            named in their definitions.
        """
        options = [
            click.Option(
                ["--key"],
                required=True,
                type=str,
                help="Key credential used for authenticating to an Azure service.",
                envvar="AZURE_SEARCH_API_KEY",
                show_envvar=True,
            ),
            click.Option(
                ["--endpoint"],
                required=True,
                type=str,
                # This is a plain literal (no f-string or .format), so the
                # placeholder uses single braces; doubled braces would be
                # shown verbatim in the help output.
                help="The URL endpoint of an Azure search service. "
                "In the form of https://{service_name}.search.windows.net",
                envvar="AZURE_SEARCH_ENDPOINT",
                show_envvar=True,
            ),
        ]
        return options
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class AzureCognitiveSearchCliWriteConfig(AzureCognitiveSearchWriteConfig, CliConfig):
    """Azure Cognitive Search write configuration that also declares its CLI options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the single required ``--index`` option naming the target index."""
        index_option = click.Option(
            ["--index"],
            required=True,
            type=str,
            help="The name of the index to connect to",
        )
        return [index_option]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def get_base_dest_cmd():
    """Create the ``azure-cognitive-search`` destination command.

    The command takes its connection options from
    ``AzureCognitiveSearchCliConfig`` and uses
    ``AzureCognitiveSearchCliWriteConfig`` both for the extra CLI options and
    as the write configuration.
    """
    # The import stays local to the function, as in the original.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name="azure-cognitive-search",
        cli_config=AzureCognitiveSearchCliConfig,
        additional_cli_options=[AzureCognitiveSearchCliWriteConfig],
        write_config=AzureCognitiveSearchCliWriteConfig,
    )
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import (
|
|
8
|
+
CliConfig,
|
|
9
|
+
)
|
|
10
|
+
from unstructured_ingest.connector.biomed import SimpleBiomedConfig
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class BiomedCliConfig(SimpleBiomedConfig, CliConfig):
    """Biomed configuration that also declares its own command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for the biomed command.

        ``--api-id``, ``--api-from``, ``--api-until`` and ``--path`` default
        to ``None``; ``--max-request-time`` defaults to 45.
        """
        api_id = click.Option(
            ["--api-id"],
            default=None,
            help="ID parameter for OA Web Service API.",
        )
        api_from = click.Option(
            ["--api-from"],
            default=None,
            help="From parameter for OA Web Service API.",
        )
        api_until = click.Option(
            ["--api-until"],
            default=None,
            help="Until parameter for OA Web Service API.",
        )
        ftp_path = click.Option(
            ["--path"],
            default=None,
            help="PMC Open Access FTP Directory Path.",
        )
        max_request_time = click.Option(
            ["--max-request-time"],
            default=45,
            help="(In seconds) Max request time to OA Web Service API.",
        )
        return [api_id, api_from, api_until, ftp_path, max_request_time]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Create the ``biomed`` source command, configured by ``BiomedCliConfig``."""
    return BaseSrcCmd(cmd_name="biomed", cli_config=BiomedCliConfig)
|