unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import (
|
|
8
|
+
CliConfig,
|
|
9
|
+
CliRecursiveConfig,
|
|
10
|
+
DelimitedString,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.connector.local import SimpleLocalConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class LocalCliConfig(SimpleLocalConfig, CliConfig):
    """Local connector config (SimpleLocalConfig) exposed through the CliConfig interface."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options declared for the local connector.

        ``--input-path`` is required and validated by ``click.Path`` as an
        existing file or directory; ``--file-glob`` is optional and parsed
        with ``DelimitedString``.
        """
        input_path = click.Option(
            ["--input-path"],
            type=click.Path(exists=True, file_okay=True, dir_okay=True),
            required=True,
            help="Path to the location in the local file system that will be processed.",
        )
        file_glob = click.Option(
            ["--file-glob"],
            type=DelimitedString(),
            default=None,
            help="A comma-separated list of file globs to limit which types of "
            "local files are accepted, e.g. '*.html,*.txt'",
        )
        return [input_path, file_glob]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``local`` source command from LocalCliConfig plus CliRecursiveConfig options."""
    return BaseSrcCmd(
        cmd_name="local",
        cli_config=LocalCliConfig,
        additional_cli_options=[CliRecursiveConfig],
    )
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import CliConfig, DelimitedString
|
|
8
|
+
from unstructured_ingest.connector.mongodb import SimpleMongoDBConfig
|
|
9
|
+
from unstructured_ingest.interfaces import WriteConfig
|
|
10
|
+
|
|
11
|
+
# Command name passed as cmd_name by both the source and destination command builders below.
CMD_NAME = "mongodb"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class MongoDBCliConfig(SimpleMongoDBConfig, CliConfig):
    """MongoDB connector config (SimpleMongoDBConfig) exposed through the CliConfig interface."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options describing the MongoDB connection.

        ``--database`` and ``--collection`` are required. ``--uri``,
        ``--host`` and ``--port`` are optional; ``--port`` defaults to 27017
        and ``--host`` is parsed with ``DelimitedString``.
        """
        options = [
            click.Option(
                ["--uri"],
                # Fixed typo in the user-facing text (was "URI to user when connecting").
                help="URI to use when connecting",
            ),
            click.Option(
                ["--host"],
                type=DelimitedString(),
                help="hostname or IP address or Unix domain socket path of a single mongod or "
                "mongos instance to connect to, or a list of hostnames",
            ),
            click.Option(["--port"], type=int, default=27017),
            click.Option(
                ["--database"], type=str, required=True, help="database name to connect to"
            ),
            click.Option(
                ["--collection"], required=True, type=str, help="collection name to connect to"
            ),
        ]
        return options
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class MongoDBReadConfig(SimpleMongoDBConfig, CliConfig):
    """Read-side CLI options for MongoDB; registered as additional options on the source command."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the single ``--batch-size`` option (non-negative int, default 100)."""
        batch_size = click.Option(
            ["--batch-size"],
            type=click.IntRange(0),
            default=100,
            help="how many records to read at a time per process",
        )
        return [batch_size]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the MongoDB source command from MongoDBCliConfig plus MongoDBReadConfig options."""
    return BaseSrcCmd(
        cmd_name=CMD_NAME,
        cli_config=MongoDBCliConfig,
        additional_cli_options=[MongoDBReadConfig],
    )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def get_base_dest_cmd():
    """Build the MongoDB destination command using MongoDBCliConfig and the generic WriteConfig."""
    # Imported at call time rather than at module level
    # (presumably to avoid an import cycle -- TODO confirm).
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name=CMD_NAME,
        cli_config=MongoDBCliConfig,
        write_config=WriteConfig,
    )
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import (
|
|
8
|
+
CliConfig,
|
|
9
|
+
CliRecursiveConfig,
|
|
10
|
+
DelimitedString,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.connector.notion.connector import SimpleNotionConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class NotionCliConfig(SimpleNotionConfig, CliConfig):
    """Notion connector config (SimpleNotionConfig) exposed through the CliConfig interface."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options declared for the Notion connector.

        ``--notion-api-key`` is required; ``--page-ids`` and
        ``--database-ids`` are optional and parsed with ``DelimitedString``.
        """
        api_key = click.Option(
            ["--notion-api-key"],
            type=str,
            required=True,
            help="API key for Notion api",
        )
        page_ids = click.Option(
            ["--page-ids"],
            type=DelimitedString(),
            default=None,
            help="Notion page IDs to pull text from",
        )
        database_ids = click.Option(
            ["--database-ids"],
            type=DelimitedString(),
            default=None,
            help="Notion database IDs to pull text from",
        )
        return [api_key, page_ids, database_ids]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``notion`` source command from NotionCliConfig plus CliRecursiveConfig options."""
    return BaseSrcCmd(
        cmd_name="notion",
        cli_config=NotionCliConfig,
        additional_cli_options=[CliRecursiveConfig],
    )
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import (
|
|
8
|
+
CliConfig,
|
|
9
|
+
CliRecursiveConfig,
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.connector.onedrive import SimpleOneDriveConfig
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class OnedriveCliConfig(SimpleOneDriveConfig, CliConfig):
    """OneDrive connector config (SimpleOneDriveConfig) exposed through the CliConfig interface."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options declared for the OneDrive connector.

        ``--client-id``, ``--client-cred`` and ``--user-pname`` are required;
        ``--tenant``, ``--path`` and ``--authority-url`` carry defaults.
        """
        return [
            click.Option(
                ["--client-id"],
                type=str,
                required=True,
                help="Microsoft app client ID",
            ),
            click.Option(
                ["--client-cred"],
                type=str,
                required=True,
                help="Microsoft App client secret",
            ),
            click.Option(
                ["--user-pname"],
                type=str,
                required=True,
                help="User principal name, usually is your Azure AD email.",
            ),
            click.Option(
                ["--tenant"],
                type=str,
                default="common",
                help="ID or domain name associated with your Azure AD instance",
            ),
            click.Option(
                ["--path"],
                type=str,
                default=None,
                help="Folder to start parsing files from.",
            ),
            click.Option(
                ["--authority-url"],
                type=str,
                default="https://login.microsoftonline.com",
                help="Authentication token provider for Microsoft apps, default is "
                "https://login.microsoftonline.com",
            ),
        ]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``onedrive`` source command from OnedriveCliConfig plus CliRecursiveConfig."""
    return BaseSrcCmd(
        cmd_name="onedrive",
        cli_config=OnedriveCliConfig,
        additional_cli_options=[CliRecursiveConfig],
    )
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.cmds.elasticsearch import ElasticsearchCliWriteConfig
|
|
8
|
+
from unstructured_ingest.cli.interfaces import CliConfig, DelimitedString
|
|
9
|
+
from unstructured_ingest.connector.opensearch import SimpleOpenSearchConfig
|
|
10
|
+
|
|
11
|
+
# OpenSearch command name; the command builders in this module pass this same value as cmd_name.
CMD_NAME = "opensearch"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class OpenSearchCliConfig(SimpleOpenSearchConfig, CliConfig):
    """OpenSearch connector config (SimpleOpenSearchConfig) exposed through CliConfig."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options declared for the OpenSearch connector.

        Only ``--index-name`` is required. The list also covers hosts, field
        selection, basic-auth credentials, SSL flags, certificate paths and
        the read batch size.
        """
        return [
            click.Option(
                ["--index-name"],
                type=str,
                required=True,
                help="Name of the OpenSearch index to pull data from, or upload data to.",
            ),
            click.Option(
                ["--hosts"],
                type=DelimitedString(),
                help='List of the OpenSearch hosts to connect to, e.g. "http://localhost:9200"',
            ),
            click.Option(
                ["--fields"],
                default=[],
                type=DelimitedString(),
                help="If provided, will limit the fields returned by OpenSearch "
                "to this comma-delimited list",
            ),
            click.Option(
                ["--username"],
                default=None,
                type=str,
                help="username when using basic auth",
            ),
            click.Option(
                ["--password"],
                default=None,
                type=str,
                help="password when using basic auth",
            ),
            click.Option(
                ["--use-ssl"],
                is_flag=True,
                type=bool,
                default=False,
                help="use ssl for the connection",
            ),
            click.Option(
                ["--verify-certs"],
                is_flag=True,
                type=bool,
                default=False,
                help="whether to verify SSL certificates",
            ),
            click.Option(
                ["--ssl-show-warn"],
                is_flag=True,
                type=bool,
                default=False,
                help="show warning when verify certs is disabled",
            ),
            click.Option(
                ["--ca-certs"],
                default=None,
                type=click.Path(),
                help="path to CA bundle",
            ),
            click.Option(
                ["--client-cert"],
                default=None,
                type=click.Path(),
                help="path to the file containing the private key and the certificate,"
                " or cert only if using client_key",
            ),
            click.Option(
                ["--client-key"],
                default=None,
                type=click.Path(),
                help="path to the file containing the private key"
                " if using separate cert and key files",
            ),
            click.Option(
                ["--batch-size"],
                type=click.IntRange(0),
                default=100,
                help="how many records to read at a time per process",
            ),
        ]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the OpenSearch source command from OpenSearchCliConfig.

    The command name comes from the module-level ``CMD_NAME`` constant
    ("opensearch") instead of a repeated string literal.
    """
    cmd_cls = BaseSrcCmd(
        cmd_name=CMD_NAME,
        cli_config=OpenSearchCliConfig,
    )
    return cmd_cls
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def get_base_dest_cmd():
    """Build the OpenSearch destination command.

    Uses OpenSearchCliConfig for the connection options and
    ElasticsearchCliWriteConfig for the write options. The command name comes
    from the module-level ``CMD_NAME`` constant ("opensearch") instead of a
    repeated string literal.
    """
    # Imported at call time rather than at module level
    # (presumably to avoid an import cycle -- TODO confirm).
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    cmd_cls = BaseDestCmd(
        cmd_name=CMD_NAME,
        cli_config=OpenSearchCliConfig,
        additional_cli_options=[ElasticsearchCliWriteConfig],
        addition_configs={
            "connector_config": SimpleOpenSearchConfig,
            "write_config": ElasticsearchCliWriteConfig,
        },
    )
    return cmd_cls
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import (
|
|
8
|
+
CliConfig,
|
|
9
|
+
CliRecursiveConfig,
|
|
10
|
+
DelimitedString,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.connector.outlook import SimpleOutlookConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class OutlookCliConfig(SimpleOutlookConfig, CliConfig):
    """Outlook connector config (SimpleOutlookConfig) exposed through the CliConfig interface."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options declared for the Outlook connector.

        ``--client-id`` and ``--user-email`` are required; the remaining
        options carry defaults. ``--outlook-folders`` is parsed with
        ``DelimitedString``.
        """
        return [
            click.Option(
                ["--client-id"],
                type=str,
                required=True,
                help="Microsoft app client ID",
            ),
            click.Option(
                ["--user-email"],
                type=str,
                required=True,
                help="Outlook email to download messages from.",
            ),
            click.Option(
                ["--tenant"],
                help="ID or domain name associated with your Azure AD instance",
                default="common",
            ),
            click.Option(
                ["--outlook-folders"],
                type=DelimitedString(),
                default=None,
                help="Folders to download email messages from. "
                "Do not specify subfolders. Use quotes if spaces in folder names.",
            ),
            click.Option(
                ["--client-cred"],
                type=str,
                default=None,
                help="Microsoft App client secret",
            ),
            click.Option(
                ["--authority-url"],
                type=str,
                default="https://login.microsoftonline.com",
                help="Authentication token provider for Microsoft apps, default is "
                "https://login.microsoftonline.com",
            ),
        ]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``outlook`` source command from OutlookCliConfig plus CliRecursiveConfig."""
    return BaseSrcCmd(
        cmd_name="outlook",
        cli_config=OutlookCliConfig,
        additional_cli_options=[CliRecursiveConfig],
    )
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.interfaces import (
|
|
7
|
+
CliConfig,
|
|
8
|
+
)
|
|
9
|
+
from unstructured_ingest.connector.pinecone import PineconeWriteConfig, SimplePineconeConfig
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class PineconeCliConfig(SimplePineconeConfig, CliConfig):
    """Pinecone connector config (SimplePineconeConfig) exposed through the CliConfig interface."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options declared for the Pinecone connection.

        All three options are required. ``--api-key`` may also be supplied
        through the ``PINECONE_API_KEY`` environment variable.
        """
        api_key = click.Option(
            ["--api-key"],
            type=str,
            required=True,
            envvar="PINECONE_API_KEY",
            show_envvar=True,
            help="API key used for authenticating to a Pinecone instance.",
        )
        index_name = click.Option(
            ["--index-name"],
            type=str,
            required=True,
            help="The name of the pinecone index to connect to.",
        )
        environment = click.Option(
            ["--environment"],
            type=str,
            required=True,
            help="The environment where the index lives. Eg. 'gcp-starter' or 'us-east1-gcp'",
        )
        return [api_key, index_name, environment]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class PineconeCliWriteConfig(PineconeWriteConfig, CliConfig):
    """Pinecone write config (PineconeWriteConfig) exposed through the CliConfig interface."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the write options: ``--batch-size`` (default 50), ``--num-processes`` (default 2)."""
        batch_size = click.Option(
            ["--batch-size"],
            type=int,
            default=50,
            help="Number of records per batch",
        )
        num_processes = click.Option(
            ["--num-processes"],
            type=int,
            default=2,
            help="Number of parallel processes with which to upload elements",
        )
        return [batch_size, num_processes]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_base_dest_cmd():
    """Build the ``pinecone`` destination command.

    Connection options come from PineconeCliConfig, extra CLI options from
    PineconeCliWriteConfig, and PineconeWriteConfig is passed as the write config.
    """
    # Imported at call time rather than at module level
    # (presumably to avoid an import cycle -- TODO confirm).
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name="pinecone",
        cli_config=PineconeCliConfig,
        additional_cli_options=[PineconeCliWriteConfig],
        write_config=PineconeWriteConfig,
    )
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.interfaces import (
|
|
7
|
+
CliConfig,
|
|
8
|
+
)
|
|
9
|
+
from unstructured_ingest.connector.qdrant import QdrantWriteConfig, SimpleQdrantConfig
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class QdrantCliConfig(SimpleQdrantConfig, CliConfig):
    """Qdrant connector config (SimpleQdrantConfig) exposed through the CliConfig interface."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options declared for the Qdrant connection.

        Only ``--collection-name`` is required. ``--port`` and ``--grpc-port``
        default to 6333 and 6334. ``--api-key`` may also be supplied through
        the ``QDRANT_API_KEY`` environment variable.
        """
        options = [
            click.Option(
                ["--collection-name"],
                required=True,
                type=str,
                help="The name of the Qdrant collection to use.",
            ),
            click.Option(
                ["--location"],
                type=str,
                help="The location of the Qdrant cluster.",
            ),
            click.Option(
                ["--url"],
                type=str,
                help="The location of the Qdrant cluster.",
            ),
            click.Option(
                ["--port"],
                type=int,
                default=6333,
                help="Port of the REST API interface. Default: 6333.",
            ),
            click.Option(
                ["--grpc-port"],
                type=int,
                default=6334,
                help="Port of the gRPC interface. Default: 6334.",
            ),
            click.Option(
                ["--prefer-grpc"],
                type=bool,
                is_flag=True,
                # Fixed the protocol name in the user-facing text (was "gPRC").
                help="Whether to use gRPC interface whenever possible in methods. Default: False.",
            ),
            click.Option(
                ["--https"],
                type=bool,
                is_flag=True,
                help="Whether to use HTTPS(SSL) protocol. Default: False.",
            ),
            click.Option(
                ["--prefix"],
                type=str,
                help="Prefix to add the REST API endpoints.",
            ),
            click.Option(
                ["--timeout"],
                type=int,
                help="Timeout for operations. Default: 5.0 seconds for REST, unlimited for gRPC.",
            ),
            click.Option(
                ["--host"],
                type=str,
                help="Host name of the Qdrant service.",
            ),
            click.Option(
                ["--path"],
                type=str,
                help="Persistence path for QdrantLocal.",
            ),
            click.Option(
                ["--force-disable-check-same-thread"],
                type=bool,
                is_flag=True,
                help="Whether to force disable check same thread for QdrantLocal.",
            ),
            click.Option(
                ["--api-key"],
                type=str,
                help="API key for authentication in Qdrant Cloud. Default: None.",
                envvar="QDRANT_API_KEY",
                show_envvar=True,
            ),
        ]
        return options
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
class QdrantCliWriteConfig(QdrantWriteConfig, CliConfig):
    """Qdrant write config (QdrantWriteConfig) exposed through the CliConfig interface."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the write options: ``--batch-size`` (default 50), ``--num-processes`` (default 2)."""
        batch_size = click.Option(
            ["--batch-size"],
            type=int,
            default=50,
            help="Number of points to upload per batch",
        )
        num_processes = click.Option(
            ["--num-processes"],
            type=int,
            default=2,
            help="Number of parallel processes with which to upload",
        )
        return [batch_size, num_processes]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def get_base_dest_cmd():
    """Build the ``qdrant`` destination command.

    Connection options come from QdrantCliConfig, extra CLI options from
    QdrantCliWriteConfig, and QdrantWriteConfig is passed as the write config.
    """
    # Imported at call time rather than at module level
    # (presumably to avoid an import cycle -- TODO confirm).
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name="qdrant",
        cli_config=QdrantCliConfig,
        additional_cli_options=[QdrantCliWriteConfig],
        write_config=QdrantWriteConfig,
    )
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
7
|
+
from unstructured_ingest.cli.interfaces import (
|
|
8
|
+
CliConfig,
|
|
9
|
+
)
|
|
10
|
+
from unstructured_ingest.connector.reddit import SimpleRedditConfig
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class RedditCliConfig(SimpleRedditConfig, CliConfig):
    """Reddit connector config (SimpleRedditConfig) exposed through the CliConfig interface."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options declared for the Reddit connector.

        Every option except ``--search-query`` is required. ``--num-posts``
        must be a non-negative integer.
        """
        options = [
            click.Option(
                ["--client-id"],
                required=True,
                type=str,
                help="The client ID, see "
                "https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#prerequisites"  # noqa: E501
                " for more information.",
            ),
            click.Option(
                ["--client-secret"],
                required=True,
                type=str,
                help="The client secret, see "
                "https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#prerequisites"  # noqa: E501
                " for more information.",
            ),
            click.Option(
                ["--subreddit-name"],
                required=True,
                type=str,
                # The subreddit prefix is "r/"; the text previously rendered it as "r\".
                help='The name of a subreddit, without the "r/", e.g. "machinelearning"',
            ),
            click.Option(
                ["--search-query"],
                default=None,
                type=str,
                help="If set, return posts using this query. Otherwise, use hot posts.",
            ),
            click.Option(
                ["--num-posts"],
                required=True,
                type=click.IntRange(0),
                # The option is required, so the text no longer says "If set".
                help="Limits the number of posts to pull in.",
            ),
            click.Option(
                ["--user-agent"],
                required=True,
                type=str,
                help="user agent request header to use when calling Reddit API",
            ),
        ]
        return options
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``reddit`` source command from RedditCliConfig."""
    return BaseSrcCmd(
        cmd_name="reddit",
        cli_config=RedditCliConfig,
    )
|