unstructured-ingest 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- examples/airtable.py +44 -0
- examples/azure_cognitive_search.py +55 -0
- examples/chroma.py +54 -0
- examples/couchbase.py +55 -0
- examples/databricks_volumes_dest.py +55 -0
- examples/databricks_volumes_source.py +53 -0
- examples/delta_table.py +45 -0
- examples/discord_example.py +36 -0
- examples/elasticsearch.py +49 -0
- examples/google_drive.py +45 -0
- examples/kdbai.py +54 -0
- examples/local.py +36 -0
- examples/milvus.py +44 -0
- examples/mongodb.py +53 -0
- examples/opensearch.py +50 -0
- examples/pinecone.py +57 -0
- examples/s3.py +38 -0
- examples/salesforce.py +44 -0
- examples/sharepoint.py +47 -0
- examples/singlestore.py +49 -0
- examples/sql.py +90 -0
- examples/vectara.py +54 -0
- examples/weaviate.py +44 -0
- test/integration/chunkers/test_chunkers.py +1 -1
- test/integration/connectors/conftest.py +1 -1
- test/integration/connectors/databricks/test_volumes_native.py +3 -3
- test/integration/connectors/discord/test_discord.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +2 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
- test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
- test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
- test/integration/connectors/sql/test_postgres.py +2 -2
- test/integration/connectors/sql/test_singlestore.py +2 -2
- test/integration/connectors/sql/test_snowflake.py +2 -2
- test/integration/connectors/sql/test_sqlite.py +2 -2
- test/integration/connectors/sql/test_vastdb.py +1 -1
- test/integration/connectors/test_astradb.py +2 -2
- test/integration/connectors/test_azure_ai_search.py +2 -2
- test/integration/connectors/test_chroma.py +2 -2
- test/integration/connectors/test_confluence.py +1 -1
- test/integration/connectors/test_delta_table.py +2 -2
- test/integration/connectors/test_dropbox.py +2 -2
- test/integration/connectors/test_github.py +49 -0
- test/integration/connectors/test_google_drive.py +2 -2
- test/integration/connectors/test_jira.py +1 -1
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +2 -2
- test/integration/connectors/test_mongodb.py +2 -2
- test/integration/connectors/test_neo4j.py +7 -7
- test/integration/connectors/test_notion.py +2 -2
- test/integration/connectors/test_onedrive.py +2 -2
- test/integration/connectors/test_pinecone.py +3 -3
- test/integration/connectors/test_qdrant.py +6 -6
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +3 -3
- test/integration/connectors/test_sharepoint.py +1 -1
- test/integration/connectors/test_vectara.py +4 -4
- test/integration/connectors/test_zendesk.py +2 -2
- test/integration/connectors/utils/validation/destination.py +2 -2
- test/integration/connectors/utils/validation/source.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +1 -1
- test/integration/connectors/weaviate/test_local.py +2 -2
- test/integration/embedders/test_azure_openai.py +1 -1
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -1
- test/integration/embedders/test_mixedbread.py +1 -1
- test/integration/embedders/test_octoai.py +2 -2
- test/integration/embedders/test_openai.py +2 -2
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +1 -1
- test/integration/embedders/test_voyageai.py +1 -1
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
- test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
- test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
- test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
- test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
- test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
- test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
- test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
- test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
- test/unit/test_html.py +1 -1
- test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
- test/unit/test_utils.py +106 -97
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/__init__.py +0 -14
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +259 -9
- unstructured_ingest/cli/base/dest.py +58 -61
- unstructured_ingest/cli/base/src.py +54 -36
- unstructured_ingest/cli/cli.py +4 -17
- unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
- unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
- unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
- unstructured_ingest/embed/bedrock.py +3 -3
- unstructured_ingest/embed/octoai.py +3 -3
- unstructured_ingest/embed/openai.py +3 -3
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +4 -4
- unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
- unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
- unstructured_ingest/{v2/otel.py → otel.py} +1 -1
- unstructured_ingest/pipeline/__init__.py +0 -22
- unstructured_ingest/pipeline/interfaces.py +179 -238
- unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
- unstructured_ingest/pipeline/pipeline.py +388 -97
- unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
- unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
- unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
- unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/processes/connectors/github.py +221 -0
- unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
- unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
- unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
- unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
- unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
- unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
- unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
- unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
- unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
- unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
- unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
- unstructured_ingest/utils/compression.py +1 -48
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/utils/html.py +3 -3
- unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
- unstructured_ingest/utils/string_and_date_utils.py +1 -1
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
- unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
- test/unit/v2/test_utils.py +0 -82
- unstructured_ingest/cli/cmd_factory.py +0 -12
- unstructured_ingest/cli/cmds/__init__.py +0 -145
- unstructured_ingest/cli/cmds/airtable.py +0 -69
- unstructured_ingest/cli/cmds/astradb.py +0 -99
- unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
- unstructured_ingest/cli/cmds/biomed.py +0 -52
- unstructured_ingest/cli/cmds/chroma.py +0 -104
- unstructured_ingest/cli/cmds/clarifai.py +0 -71
- unstructured_ingest/cli/cmds/confluence.py +0 -69
- unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
- unstructured_ingest/cli/cmds/delta_table.py +0 -94
- unstructured_ingest/cli/cmds/discord.py +0 -47
- unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
- unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
- unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
- unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
- unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
- unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
- unstructured_ingest/cli/cmds/github.py +0 -54
- unstructured_ingest/cli/cmds/gitlab.py +0 -54
- unstructured_ingest/cli/cmds/google_drive.py +0 -49
- unstructured_ingest/cli/cmds/hubspot.py +0 -70
- unstructured_ingest/cli/cmds/jira.py +0 -71
- unstructured_ingest/cli/cmds/kafka.py +0 -102
- unstructured_ingest/cli/cmds/local.py +0 -43
- unstructured_ingest/cli/cmds/mongodb.py +0 -72
- unstructured_ingest/cli/cmds/notion.py +0 -48
- unstructured_ingest/cli/cmds/onedrive.py +0 -66
- unstructured_ingest/cli/cmds/opensearch.py +0 -117
- unstructured_ingest/cli/cmds/outlook.py +0 -67
- unstructured_ingest/cli/cmds/pinecone.py +0 -71
- unstructured_ingest/cli/cmds/qdrant.py +0 -124
- unstructured_ingest/cli/cmds/reddit.py +0 -67
- unstructured_ingest/cli/cmds/salesforce.py +0 -58
- unstructured_ingest/cli/cmds/sharepoint.py +0 -66
- unstructured_ingest/cli/cmds/slack.py +0 -56
- unstructured_ingest/cli/cmds/sql.py +0 -66
- unstructured_ingest/cli/cmds/vectara.py +0 -66
- unstructured_ingest/cli/cmds/weaviate.py +0 -98
- unstructured_ingest/cli/cmds/wikipedia.py +0 -40
- unstructured_ingest/cli/common.py +0 -7
- unstructured_ingest/cli/interfaces.py +0 -663
- unstructured_ingest/cli/utils.py +0 -205
- unstructured_ingest/connector/airtable.py +0 -309
- unstructured_ingest/connector/astradb.py +0 -267
- unstructured_ingest/connector/azure_ai_search.py +0 -144
- unstructured_ingest/connector/biomed.py +0 -320
- unstructured_ingest/connector/chroma.py +0 -158
- unstructured_ingest/connector/clarifai.py +0 -122
- unstructured_ingest/connector/confluence.py +0 -285
- unstructured_ingest/connector/databricks_volumes.py +0 -137
- unstructured_ingest/connector/delta_table.py +0 -203
- unstructured_ingest/connector/discord.py +0 -180
- unstructured_ingest/connector/elasticsearch.py +0 -396
- unstructured_ingest/connector/fsspec/azure.py +0 -78
- unstructured_ingest/connector/fsspec/box.py +0 -109
- unstructured_ingest/connector/fsspec/dropbox.py +0 -160
- unstructured_ingest/connector/fsspec/fsspec.py +0 -359
- unstructured_ingest/connector/fsspec/gcs.py +0 -82
- unstructured_ingest/connector/fsspec/s3.py +0 -62
- unstructured_ingest/connector/fsspec/sftp.py +0 -81
- unstructured_ingest/connector/git.py +0 -124
- unstructured_ingest/connector/github.py +0 -174
- unstructured_ingest/connector/gitlab.py +0 -142
- unstructured_ingest/connector/google_drive.py +0 -348
- unstructured_ingest/connector/hubspot.py +0 -278
- unstructured_ingest/connector/jira.py +0 -469
- unstructured_ingest/connector/kafka.py +0 -293
- unstructured_ingest/connector/local.py +0 -139
- unstructured_ingest/connector/mongodb.py +0 -284
- unstructured_ingest/connector/notion/client.py +0 -248
- unstructured_ingest/connector/notion/connector.py +0 -469
- unstructured_ingest/connector/notion/helpers.py +0 -584
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
- unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
- unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
- unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
- unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
- unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
- unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
- unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
- unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
- unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
- unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
- unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
- unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
- unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
- unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
- unstructured_ingest/connector/notion/types/date.py +0 -26
- unstructured_ingest/connector/notion/types/file.py +0 -51
- unstructured_ingest/connector/notion/types/user.py +0 -76
- unstructured_ingest/connector/onedrive.py +0 -232
- unstructured_ingest/connector/opensearch.py +0 -218
- unstructured_ingest/connector/outlook.py +0 -285
- unstructured_ingest/connector/pinecone.py +0 -150
- unstructured_ingest/connector/qdrant.py +0 -144
- unstructured_ingest/connector/reddit.py +0 -166
- unstructured_ingest/connector/registry.py +0 -109
- unstructured_ingest/connector/salesforce.py +0 -301
- unstructured_ingest/connector/sharepoint.py +0 -573
- unstructured_ingest/connector/slack.py +0 -224
- unstructured_ingest/connector/sql.py +0 -199
- unstructured_ingest/connector/vectara.py +0 -253
- unstructured_ingest/connector/weaviate.py +0 -190
- unstructured_ingest/connector/wikipedia.py +0 -208
- unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
- unstructured_ingest/enhanced_dataclass/core.py +0 -99
- unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
- unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
- unstructured_ingest/interfaces.py +0 -852
- unstructured_ingest/pipeline/copy.py +0 -19
- unstructured_ingest/pipeline/doc_factory.py +0 -12
- unstructured_ingest/pipeline/partition.py +0 -60
- unstructured_ingest/pipeline/permissions.py +0 -12
- unstructured_ingest/pipeline/reformat/chunking.py +0 -134
- unstructured_ingest/pipeline/reformat/embedding.py +0 -64
- unstructured_ingest/pipeline/source.py +0 -77
- unstructured_ingest/pipeline/utils.py +0 -6
- unstructured_ingest/pipeline/write.py +0 -18
- unstructured_ingest/processor.py +0 -93
- unstructured_ingest/runner/__init__.py +0 -104
- unstructured_ingest/runner/airtable.py +0 -35
- unstructured_ingest/runner/astradb.py +0 -34
- unstructured_ingest/runner/base_runner.py +0 -89
- unstructured_ingest/runner/biomed.py +0 -45
- unstructured_ingest/runner/confluence.py +0 -35
- unstructured_ingest/runner/delta_table.py +0 -34
- unstructured_ingest/runner/discord.py +0 -35
- unstructured_ingest/runner/elasticsearch.py +0 -40
- unstructured_ingest/runner/fsspec/azure.py +0 -30
- unstructured_ingest/runner/fsspec/box.py +0 -28
- unstructured_ingest/runner/fsspec/dropbox.py +0 -30
- unstructured_ingest/runner/fsspec/fsspec.py +0 -40
- unstructured_ingest/runner/fsspec/gcs.py +0 -28
- unstructured_ingest/runner/fsspec/s3.py +0 -28
- unstructured_ingest/runner/fsspec/sftp.py +0 -28
- unstructured_ingest/runner/github.py +0 -37
- unstructured_ingest/runner/gitlab.py +0 -37
- unstructured_ingest/runner/google_drive.py +0 -35
- unstructured_ingest/runner/hubspot.py +0 -35
- unstructured_ingest/runner/jira.py +0 -35
- unstructured_ingest/runner/kafka.py +0 -34
- unstructured_ingest/runner/local.py +0 -23
- unstructured_ingest/runner/mongodb.py +0 -34
- unstructured_ingest/runner/notion.py +0 -61
- unstructured_ingest/runner/onedrive.py +0 -35
- unstructured_ingest/runner/opensearch.py +0 -40
- unstructured_ingest/runner/outlook.py +0 -33
- unstructured_ingest/runner/reddit.py +0 -35
- unstructured_ingest/runner/salesforce.py +0 -33
- unstructured_ingest/runner/sharepoint.py +0 -35
- unstructured_ingest/runner/slack.py +0 -33
- unstructured_ingest/runner/utils.py +0 -47
- unstructured_ingest/runner/wikipedia.py +0 -35
- unstructured_ingest/runner/writers/__init__.py +0 -48
- unstructured_ingest/runner/writers/astradb.py +0 -22
- unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
- unstructured_ingest/runner/writers/base_writer.py +0 -26
- unstructured_ingest/runner/writers/chroma.py +0 -22
- unstructured_ingest/runner/writers/clarifai.py +0 -19
- unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
- unstructured_ingest/runner/writers/delta_table.py +0 -24
- unstructured_ingest/runner/writers/elasticsearch.py +0 -24
- unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
- unstructured_ingest/runner/writers/fsspec/box.py +0 -21
- unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
- unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
- unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
- unstructured_ingest/runner/writers/kafka.py +0 -21
- unstructured_ingest/runner/writers/mongodb.py +0 -21
- unstructured_ingest/runner/writers/opensearch.py +0 -26
- unstructured_ingest/runner/writers/pinecone.py +0 -21
- unstructured_ingest/runner/writers/qdrant.py +0 -19
- unstructured_ingest/runner/writers/sql.py +0 -22
- unstructured_ingest/runner/writers/vectara.py +0 -22
- unstructured_ingest/runner/writers/weaviate.py +0 -21
- unstructured_ingest/utils/google_filetype.py +0 -9
- unstructured_ingest/v2/__init__.py +0 -1
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +0 -4
- unstructured_ingest/v2/cli/base/cmd.py +0 -269
- unstructured_ingest/v2/cli/base/dest.py +0 -85
- unstructured_ingest/v2/cli/base/src.py +0 -85
- unstructured_ingest/v2/cli/cli.py +0 -24
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/logger.py +0 -126
- unstructured_ingest/v2/main.py +0 -11
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +0 -211
- unstructured_ingest/v2/pipeline/pipeline.py +0 -408
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
- {test/unit/v2 → examples}/__init__.py +0 -0
- /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
- /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/data_generator.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
- /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
- /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
- /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
- /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
- /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
- /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
- /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
- /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
- /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
- /unstructured_ingest/{v2 → utils}/constants.py +0 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,19 +1,269 @@
|
|
|
1
|
-
import
|
|
2
|
-
from abc import ABC
|
|
3
|
-
from
|
|
1
|
+
import inspect
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from dataclasses import dataclass, field, fields
|
|
5
|
+
from typing import Any, Optional, Type, TypeVar
|
|
4
6
|
|
|
5
|
-
|
|
6
|
-
from
|
|
7
|
+
import click
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.cli.base.importer import import_from_string
|
|
11
|
+
from unstructured_ingest.cli.utils.click import extract_config
|
|
12
|
+
from unstructured_ingest.cli.utils.model_conversion import options_from_base_model, post_check
|
|
13
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
16
|
+
from unstructured_ingest.processes.chunker import Chunker, ChunkerConfig
|
|
17
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
18
|
+
DownloaderT,
|
|
19
|
+
IndexerT,
|
|
20
|
+
RegistryEntry,
|
|
21
|
+
UploaderT,
|
|
22
|
+
UploadStager,
|
|
23
|
+
UploadStagerConfig,
|
|
24
|
+
UploadStagerT,
|
|
25
|
+
destination_registry,
|
|
26
|
+
source_registry,
|
|
27
|
+
)
|
|
28
|
+
from unstructured_ingest.processes.connectors.local import LocalUploader, LocalUploaderConfig
|
|
29
|
+
from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
|
|
30
|
+
from unstructured_ingest.processes.filter import Filterer, FiltererConfig
|
|
31
|
+
from unstructured_ingest.processes.partitioner import Partitioner, PartitionerConfig
|
|
32
|
+
|
|
33
|
+
CommandT = TypeVar("CommandT", bound=click.Command)
|
|
7
34
|
|
|
8
35
|
|
|
9
36
|
@dataclass
class BaseCmd(ABC):
    """Common machinery for click commands generated from connector registry entries.

    Subclasses provide the connector-specific options (``get_registry_options``) and
    the command callback (``cmd``). This base class converts pydantic config models
    into click options and assembles a ``Pipeline`` from the flat option dictionaries
    that click hands to the callback.
    """

    cmd_name: str
    registry_entry: RegistryEntry
    # Config models whose fields are exposed as options on top of the registry ones.
    default_configs: list[Type[BaseModel]] = field(default_factory=list)

    @abstractmethod
    def get_registry_options(self):
        """Return the click options derived from ``registry_entry``.

        The result must be a mutable list: ``add_options`` extends it in place.
        """
        pass

    def get_default_options(self) -> list[click.Option]:
        """Return the click options generated from every model in ``default_configs``."""
        options = []
        for extra in self.default_configs:
            options.extend(options_from_base_model(model=extra))
        return options

    @classmethod
    def consolidate_options(cls, options: list[click.Option]) -> list[click.Option]:
        """Drop exact duplicate options, keeping the first occurrence of each name.

        Returns:
            ``options`` itself when no name repeats, otherwise a new list in the
            original order without the repeated entries.

        Raises:
            ValueError: if two options share a name but differ in any attribute.
        """
        first_by_name: dict[Any, click.Option] = {}
        for option in options:
            existing_option = first_by_name.get(option.name)
            if existing_option is None:
                first_by_name[option.name] = option
                continue
            if existing_option.__dict__ == option.__dict__:
                continue
            option_diff = cls.get_options_diff(o1=option, o2=existing_option)
            # Sort so the message is stable; a set has no defined iteration order.
            ordered_diff = sorted(option_diff, key=lambda d: (str(d[0]), str(d[1])))
            raise ValueError(
                "Conflicting duplicate {} option defined: {}".format(
                    option.name, " | ".join([f"{d[0]}: {d[1]}" for d in ordered_diff])
                )
            )
        if len(first_by_name) == len(options):
            return options
        return list(first_by_name.values())

    @staticmethod
    def get_options_diff(o1: click.Option, o2: click.Option):
        """Return the ``(attribute, value)`` pairs present on only one of the two options.

        ``opts`` and ``secondary_opts`` are compared as comma-joined strings. The
        options themselves are left untouched: the comparison works on copies of
        their ``__dict__``. Values that cannot be hashed are represented by their
        ``repr`` so they can still be reported in the resulting set.
        """

        def normalized(option: click.Option) -> dict[str, Any]:
            # Copy first: writing into option.__dict__ would corrupt the live option.
            attrs = dict(option.__dict__)
            for key in ("opts", "secondary_opts"):
                attrs[key] = ",".join(attrs[key])
            return attrs

        def hashable(value: Any) -> Any:
            try:
                hash(value)
            except TypeError:
                return repr(value)
            return value

        o1_dict = normalized(o1)
        o2_dict = normalized(o2)
        missing = object()
        option_diff = set()
        for key in o1_dict.keys() | o2_dict.keys():
            v1 = o1_dict.get(key, missing)
            v2 = o2_dict.get(key, missing)
            if v1 is v2 or v1 == v2:
                continue
            for value in (v1, v2):
                if value is not missing:
                    option_diff.add((key, hashable(value)))
        return option_diff

    @property
    def cmd_name_key(self):
        """``cmd_name`` with dashes replaced by underscores."""
        return self.cmd_name.replace("-", "_")

    @property
    def cli_cmd_name(self):
        """``cmd_name`` with underscores replaced by dashes, used as the click command name."""
        return self.cmd_name.replace("_", "-")

    @abstractmethod
    def cmd(self, ctx: click.Context, **options) -> None:
        """Command callback invoked by click with the parsed options."""
        pass

    def add_options(self, cmd: CommandT) -> CommandT:
        """Attach the registry and default options to ``cmd`` and return it.

        ``post_check`` is run on the combined list before the options are attached.
        """
        options = self.get_registry_options()
        options.extend(self.get_default_options())
        post_check(options=options, name=cmd.name)
        cmd.params.extend(options)
        return cmd

    def get_pipeline(
        self,
        src: str,
        source_options: dict[str, Any],
        dest: Optional[str] = None,
        destination_options: Optional[dict[str, Any]] = None,
    ) -> Pipeline:
        """Build a ``Pipeline`` from flat CLI option dictionaries.

        Args:
            src: key into ``source_registry``.
            source_options: options parsed for the source command; also used for the
                processor, partitioner, chunker, filterer and embedder configs.
            dest: optional key into ``destination_registry``.
            destination_options: options parsed for the destination command. ``None``
                is treated as an empty dict; the caller's dict is never modified.

        Without ``dest`` the pipeline falls back to the local uploader, taking
        ``output_dir`` from ``source_options`` unless ``destination_options`` sets it.
        """
        logger.debug(
            f"creating pipeline from cli using source {src} with options: {source_options}"
        )
        # Work on a private copy so None is safe for the .get() calls in the
        # destination helpers and the output_dir default below does not leak
        # back into the caller's dictionary.
        destination_options = dict(destination_options or {})
        pipeline_kwargs: dict[str, Any] = {
            "context": self.get_processor_config(options=source_options),
            "downloader": self.get_downloader(src=src, options=source_options),
            "indexer": self.get_indexer(src=src, options=source_options),
            "partitioner": self.get_partitioner(options=source_options),
        }
        if chunker := self.get_chunker(options=source_options):
            pipeline_kwargs["chunker"] = chunker
        if filterer := self.get_filterer(options=source_options):
            pipeline_kwargs["filterer"] = filterer
        if embedder := self.get_embedder(options=source_options):
            pipeline_kwargs["embedder"] = embedder
        if dest:
            logger.debug(
                f"setting destination on pipeline {dest} with options: {destination_options}"
            )
            if uploader_stager := self.get_upload_stager(dest=dest, options=destination_options):
                pipeline_kwargs["stager"] = uploader_stager
            pipeline_kwargs["uploader"] = self.get_uploader(dest=dest, options=destination_options)
        else:
            # Default to local uploader
            # TODO remove after v1 no longer supported
            if "output_dir" not in destination_options:
                destination_options["output_dir"] = source_options["output_dir"]
            pipeline_kwargs["uploader"] = self.get_default_uploader(options=destination_options)
        return Pipeline(**pipeline_kwargs)

    @staticmethod
    def get_default_uploader(options: dict[str, Any]) -> UploaderT:
        """Return a ``LocalUploader`` configured from ``options``."""
        uploader_config = extract_config(flat_data=options, config=LocalUploaderConfig)
        return LocalUploader(upload_config=uploader_config)

    @staticmethod
    def get_chunker(options: dict[str, Any]) -> Optional[Chunker]:
        """Return a ``Chunker``, or ``None`` when no chunking strategy is configured."""
        chunker_config = extract_config(flat_data=options, config=ChunkerConfig)
        if not chunker_config.chunking_strategy:
            return None
        return Chunker(config=chunker_config)

    @staticmethod
    def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
        """Return a ``Filterer``, or ``None`` when the config's ``model_dump()`` is empty."""
        filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
        if not filterer_configs.model_dump():
            return None
        return Filterer(config=filterer_configs)

    @staticmethod
    def get_embedder(options: dict[str, Any]) -> Optional[Embedder]:
        """Return an ``Embedder``, or ``None`` when no embedding provider is configured."""
        embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
        if not embedder_config.embedding_provider:
            return None
        return Embedder(config=embedder_config)

    @staticmethod
    def get_partitioner(options: dict[str, Any]) -> Partitioner:
        """Return a ``Partitioner`` configured from ``options``."""
        partitioner_config = extract_config(flat_data=options, config=PartitionerConfig)
        return Partitioner(config=partitioner_config)

    @staticmethod
    def get_processor_config(options: dict[str, Any]) -> ProcessorConfig:
        """Extract the ``ProcessorConfig`` from ``options``."""
        return extract_config(flat_data=options, config=ProcessorConfig)

    @staticmethod
    def get_indexer(src: str, options: dict[str, Any]) -> IndexerT:
        """Instantiate the indexer registered for ``src``.

        Only the config classes the registry entry actually declares are extracted
        from ``options`` and passed to the indexer.
        """
        source_entry = source_registry[src]
        indexer_kwargs: dict[str, Any] = {}
        if indexer_config_cls := source_entry.indexer_config:
            indexer_kwargs["index_config"] = extract_config(
                flat_data=options, config=indexer_config_cls
            )
        if connection_config_cls := source_entry.connection_config:
            indexer_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        indexer_cls = source_entry.indexer
        return indexer_cls(**indexer_kwargs)

    @staticmethod
    def get_downloader(src: str, options: dict[str, Any]) -> DownloaderT:
        """Instantiate the downloader registered for ``src``.

        Only the config classes the registry entry actually declares are extracted
        from ``options`` and passed to the downloader.
        """
        source_entry = source_registry[src]
        downloader_kwargs: dict[str, Any] = {}
        if downloader_config_cls := source_entry.downloader_config:
            downloader_kwargs["download_config"] = extract_config(
                flat_data=options, config=downloader_config_cls
            )
        if connection_config_cls := source_entry.connection_config:
            downloader_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        downloader_cls = source_entry.downloader
        return downloader_cls(**downloader_kwargs)

    @staticmethod
    def get_custom_stager(
        stager_reference: str, stager_config_kwargs: Optional[dict] = None
    ) -> Optional[UploadStagerT]:
        """Import and instantiate a user-supplied upload stager.

        Args:
            stager_reference: import string resolved with ``import_from_string``.
            stager_config_kwargs: keyword arguments for the stager's config class;
                when empty or ``None`` the stager is built without an explicit config.

        Raises:
            ValueError: if the reference is not an ``UploadStager`` subclass, or its
                ``upload_stager_config`` field type is not an ``UploadStagerConfig``
                subclass.
        """
        stager_cls = import_from_string(stager_reference)
        if not inspect.isclass(stager_cls):
            raise ValueError(
                f"custom stager must be a reference to a python class, got: {type(stager_cls)}"
            )
        if not issubclass(stager_cls, UploadStager):
            raise ValueError(
                "custom stager must be an implementation of the UploadStager interface"
            )
        # The config class is discovered from the annotation on the stager's
        # upload_stager_config dataclass field.
        fields_dict = {f.name: f.type for f in fields(stager_cls)}
        upload_stager_config_cls = fields_dict["upload_stager_config"]
        if not inspect.isclass(upload_stager_config_cls):
            raise ValueError(
                f"custom stager config must be a class, got: {type(upload_stager_config_cls)}"
            )
        if not issubclass(upload_stager_config_cls, UploadStagerConfig):
            raise ValueError(
                "custom stager config must be an implementation "
                "of the UploadStagerConfig interface"
            )
        upload_stager_kwargs: dict[str, Any] = {}
        if stager_config_kwargs:
            upload_stager_kwargs["upload_stager_config"] = upload_stager_config_cls(
                **stager_config_kwargs
            )
        return stager_cls(**upload_stager_kwargs)

    @staticmethod
    def get_upload_stager(dest: str, options: dict[str, Any]) -> Optional[UploadStagerT]:
        """Return the upload stager for ``dest``, or ``None`` if it has none.

        A ``custom_stager`` entry in ``options`` takes precedence over the stager
        registered for the destination.
        """
        if custom_stager := options.get("custom_stager"):
            return BaseCmd.get_custom_stager(
                stager_reference=custom_stager,
                stager_config_kwargs=options.get("custom_stager_config_kwargs"),
            )
        dest_entry = destination_registry[dest]
        upload_stager_kwargs: dict[str, Any] = {}
        if upload_stager_config_cls := dest_entry.upload_stager_config:
            upload_stager_kwargs["upload_stager_config"] = extract_config(
                flat_data=options, config=upload_stager_config_cls
            )
        if upload_stager_cls := dest_entry.upload_stager:
            return upload_stager_cls(**upload_stager_kwargs)
        return None

    @staticmethod
    def get_uploader(dest, options: dict[str, Any]) -> UploaderT:
        """Instantiate the uploader registered for ``dest``.

        Only the config classes the registry entry actually declares are extracted
        from ``options`` and passed to the uploader.
        """
        dest_entry = destination_registry[dest]
        uploader_kwargs: dict[str, Any] = {}
        if uploader_config_cls := dest_entry.uploader_config:
            uploader_kwargs["upload_config"] = extract_config(
                flat_data=options, config=uploader_config_cls
            )
        if connection_config_cls := dest_entry.connection_config:
            uploader_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        uploader_cls = dest_entry.uploader
        return uploader_cls(**uploader_kwargs)
|
|
@@ -1,87 +1,84 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import typing as t
|
|
3
2
|
from dataclasses import dataclass
|
|
4
3
|
|
|
5
4
|
import click
|
|
6
5
|
|
|
7
6
|
from unstructured_ingest.cli.base.cmd import BaseCmd
|
|
8
|
-
from unstructured_ingest.cli.
|
|
9
|
-
from unstructured_ingest.cli.
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
from unstructured_ingest.cli.interfaces import BaseConfig, CliFilesStorageConfig
|
|
13
|
-
from unstructured_ingest.cli.utils import (
|
|
14
|
-
add_options,
|
|
15
|
-
conform_click_options,
|
|
16
|
-
extract_config,
|
|
17
|
-
extract_configs,
|
|
18
|
-
)
|
|
19
|
-
from unstructured_ingest.logger import ingest_log_streaming_init, logger
|
|
20
|
-
from unstructured_ingest.runner.writers import writer_map
|
|
7
|
+
from unstructured_ingest.cli.utils.click import Dict, conform_click_options
|
|
8
|
+
from unstructured_ingest.cli.utils.model_conversion import options_from_base_model
|
|
9
|
+
from unstructured_ingest.logger import logger
|
|
10
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
21
11
|
|
|
22
12
|
|
|
23
13
|
@dataclass
class DestCmd(BaseCmd):
    """click command for a destination connector, run as a subcommand of a source command."""

    registry_entry: DestinationRegistryEntry

    def get_registry_options(self):
        """Return the options built from the entry's uploader, stager and connection configs.

        Config slots the registry entry leaves empty are skipped; options that are
        exact duplicates across the configs are consolidated.
        """
        options = []
        configs = [
            config
            for config in [
                self.registry_entry.uploader_config,
                self.registry_entry.upload_stager_config,
                self.registry_entry.connection_config,
            ]
            if config
        ]
        for config in configs:
            options.extend(options_from_base_model(model=config))
        options = self.consolidate_options(options=options)
        return options

    def cmd(self, ctx: click.Context, **options) -> None:
        """Run the pipeline for the parent source command with this destination.

        The source name and source options are taken from the parent click context.

        Raises:
            click.ClickException: if there is no parent command, the parent has no
                name, or building/running the pipeline fails.
        """
        if not ctx.parent:
            raise click.ClickException("destination command called without a parent")
        if not ctx.parent.info_name:
            raise click.ClickException("parent command missing info name")
        source_cmd = ctx.parent.info_name.replace("-", "_")
        source_options: dict = ctx.parent.params
        # Honour the flag whether it was parsed by this command or by the parent
        # source command, so a verbose source invocation stays verbose here.
        verbose = bool(options.get("verbose", False) or source_options.get("verbose", False))
        logger.setLevel(logging.DEBUG if verbose else logging.INFO)
        conform_click_options(options)
        try:
            pipeline = self.get_pipeline(
                src=source_cmd,
                source_options=source_options,
                dest=self.cmd_name,
                destination_options=options,
            )
            pipeline.run()
        except Exception as e:
            logger.error(f"failed to run destination command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Command:
        """Build the click command, including the two custom-stager options.

        Raises:
            ValueError: if ``click.command`` does not return a ``click.Command``.
        """
        # Dynamically create the command without the use of click decorators
        fn = self.cmd
        fn = click.pass_context(fn)
        cmd = click.command(fn)
        if not isinstance(cmd, click.core.Command):
            raise ValueError(f"generated command was not of expected type Command: {type(cmd)}")
        cmd.name = self.cli_cmd_name
        cmd.invoke_without_command = True
        self.add_options(cmd)
        cmd.params.append(
            click.Option(
                ["--custom-stager"],
                required=False,
                type=str,
                default=None,
                help="Pass a pointer to a custom upload stager to use, "
                "must be in format '<module>:<attribute>'",
            )
        )
        cmd.params.append(
            click.Option(
                ["--custom-stager-config-kwargs"],
                required=False,
                type=Dict(),
                default=None,
                help="Any kwargs to instantiate the configuration "
                "associated with the custom stager",
            )
        )
        return cmd
|
|
@@ -1,57 +1,75 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from dataclasses import dataclass
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import Any
|
|
3
4
|
|
|
4
5
|
import click
|
|
6
|
+
from pydantic import BaseModel
|
|
5
7
|
|
|
6
8
|
from unstructured_ingest.cli.base.cmd import BaseCmd
|
|
7
|
-
from unstructured_ingest.cli.
|
|
8
|
-
|
|
9
|
+
from unstructured_ingest.cli.utils.click import Group, conform_click_options
|
|
10
|
+
from unstructured_ingest.cli.utils.model_conversion import options_from_base_model
|
|
11
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
12
|
+
from unstructured_ingest.logger import logger
|
|
13
|
+
from unstructured_ingest.processes import (
|
|
14
|
+
ChunkerConfig,
|
|
15
|
+
EmbedderConfig,
|
|
16
|
+
FiltererConfig,
|
|
17
|
+
PartitionerConfig,
|
|
9
18
|
)
|
|
10
|
-
from unstructured_ingest.
|
|
11
|
-
from unstructured_ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
|
|
12
|
-
from unstructured_ingest.logger import ingest_log_streaming_init, logger
|
|
13
|
-
from unstructured_ingest.runner import runner_map
|
|
19
|
+
from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
|
|
14
20
|
|
|
15
21
|
|
|
16
22
|
@dataclass
class SrcCmd(BaseCmd):
    """click group for a source connector; destination commands attach to it as subcommands."""

    registry_entry: SourceRegistryEntry
    # These are config *classes* (not instances); their fields become CLI options.
    default_configs: list[type[BaseModel]] = field(
        default_factory=lambda: [
            ProcessorConfig,
            PartitionerConfig,
            EmbedderConfig,
            FiltererConfig,
            ChunkerConfig,
        ]
    )

    def get_registry_options(self):
        """Return the options built from the entry's connection, indexer and downloader configs.

        Config slots the registry entry leaves empty are skipped; options that are
        exact duplicates across the configs are consolidated.
        """
        options = []
        configs = [
            config
            for config in [
                self.registry_entry.connection_config,
                self.registry_entry.indexer_config,
                self.registry_entry.downloader_config,
            ]
            if config
        ]
        for config in configs:
            options.extend(options_from_base_model(model=config))
        options = self.consolidate_options(options=options)
        return options

    def cmd(self, ctx: click.Context, **options: Any) -> None:
        """Run a source-only pipeline unless a subcommand was invoked.

        When a subcommand is present this callback returns immediately and leaves
        the work to that subcommand.

        Raises:
            click.ClickException: if building or running the pipeline fails.
        """
        if ctx.invoked_subcommand:
            return

        conform_click_options(options)
        logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
        try:
            pipeline = self.get_pipeline(src=self.cmd_name, source_options=options)
            pipeline.run()
        except Exception as e:
            logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Group:
        """Build the click group for this source and attach its options.

        Raises:
            ValueError: if ``click.group`` does not return a ``click.Group``.
        """
        # Dynamically create the command without the use of click decorators
        fn = self.cmd
        fn = click.pass_context(fn)
        cmd = click.group(fn, cls=Group)
        if not isinstance(cmd, click.core.Group):
            raise ValueError(f"generated src command was not of expected type Group: {type(cmd)}")
        cmd.name = self.cli_cmd_name
        # Lets the callback run (and the pipeline execute) when no destination
        # subcommand is given.
        cmd.invoke_without_command = True
        self.add_options(cmd)

        return cmd
|
unstructured_ingest/cli/cli.py
CHANGED
|
@@ -1,13 +1,6 @@
|
|
|
1
|
-
from typing import TYPE_CHECKING
|
|
2
|
-
|
|
3
1
|
import click
|
|
4
2
|
|
|
5
|
-
from unstructured_ingest.cli import dest, src
|
|
6
|
-
from unstructured_ingest.v2.cli.cmds import dest as dest_v2
|
|
7
|
-
from unstructured_ingest.v2.cli.cmds import src as src_v2
|
|
8
|
-
|
|
9
|
-
if TYPE_CHECKING:
|
|
10
|
-
from click import Command
|
|
3
|
+
from unstructured_ingest.cli.cmds import dest, src
|
|
11
4
|
|
|
12
5
|
|
|
13
6
|
@click.group()
|
|
@@ -15,23 +8,17 @@ def ingest():
|
|
|
15
8
|
pass
|
|
16
9
|
|
|
17
10
|
|
|
18
|
-
def get_cmd() ->
|
|
11
|
+
def get_cmd() -> click.Command:
    """Assemble the top-level CLI command.

    Every destination command is registered on every source group, and each
    source group is then registered on the ``ingest`` root, which is returned.
    """
    root = ingest
    for source_group in src:
        # Wire all destinations under this source before exposing the source itself.
        for destination_command in dest:
            source_group.add_command(destination_command)
        root.add_command(source_group)
    return root
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import click
|
|
2
2
|
|
|
3
|
-
from unstructured_ingest.
|
|
4
|
-
from unstructured_ingest.
|
|
3
|
+
from unstructured_ingest.cli.base import DestCmd, SrcCmd
|
|
4
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
5
5
|
destination_registry,
|
|
6
6
|
source_registry,
|
|
7
7
|
)
|
|
@@ -25,7 +25,7 @@ from pydantic.fields import FieldInfo
|
|
|
25
25
|
from pydantic.types import _SecretBase
|
|
26
26
|
from pydantic_core import PydanticUndefined
|
|
27
27
|
|
|
28
|
-
from unstructured_ingest.
|
|
28
|
+
from unstructured_ingest.cli.utils.click import (
|
|
29
29
|
DelimitedString,
|
|
30
30
|
Dict,
|
|
31
31
|
PydanticDate,
|
|
@@ -106,7 +106,7 @@ def get_numerical_type(field: FieldInfo) -> click.ParamType:
|
|
|
106
106
|
if range_args:
|
|
107
107
|
return click.IntRange(**range_args) # type: ignore[arg-type]
|
|
108
108
|
return click.INT
|
|
109
|
-
# Non-integer numerical
|
|
109
|
+
# Non-integer numerical types default to float
|
|
110
110
|
if range_args:
|
|
111
111
|
return click.FloatRange(**range_args) # type: ignore[arg-type]
|
|
112
112
|
return click.FLOAT
|
|
@@ -183,13 +183,13 @@ def is_subclass(x: Any, y: Any) -> bool:
|
|
|
183
183
|
return False
|
|
184
184
|
|
|
185
185
|
|
|
186
|
-
def post_check(options: list[Option]):
|
|
186
|
+
def post_check(options: list[Option], name: str):
    """Verify that no option name is used more than once.

    Args:
        options: the options to inspect.
        name: label placed at the start of the error message (callers pass the
            command name or the model's class name).

    Raises:
        ValueError: listing every option name that occurs more than once.
    """
    occurrences = Counter(option.name for option in options)
    duplicate_names = [opt_name for opt_name, seen in occurrences.items() if seen > 1]
    if not duplicate_names:
        return
    reused = ", ".join(duplicate_names)
    raise ValueError(
        f"[{name}] the following field name were reused, all must be unique: {reused}"
    )
|
|
195
195
|
|
|
@@ -218,5 +218,5 @@ def options_from_base_model(model: Union[BaseModel, Type[BaseModel]]) -> list[Op
|
|
|
218
218
|
field_info.description = f"[sensitive] {field_info.description}"
|
|
219
219
|
options.append(get_option_from_field(option_name=option_name, field_info=field_info))
|
|
220
220
|
|
|
221
|
-
post_check(options=options)
|
|
221
|
+
post_check(options=options, name=model.__name__)
|
|
222
222
|
return options
|
|
@@ -5,7 +5,7 @@ from uuid import NAMESPACE_DNS, uuid5
|
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
|
|
7
7
|
|
|
8
|
-
from unstructured_ingest.
|
|
8
|
+
from unstructured_ingest.logger import logger
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class SourceIdentifiers(BaseModel):
|
|
@@ -13,15 +13,15 @@ from unstructured_ingest.embed.interfaces import (
|
|
|
13
13
|
BaseEmbeddingEncoder,
|
|
14
14
|
EmbeddingConfig,
|
|
15
15
|
)
|
|
16
|
-
from unstructured_ingest.
|
|
17
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
18
|
-
from unstructured_ingest.v2.errors import (
|
|
16
|
+
from unstructured_ingest.errors_v2 import (
|
|
19
17
|
ProviderError,
|
|
20
18
|
RateLimitError,
|
|
21
19
|
UserAuthError,
|
|
22
20
|
UserError,
|
|
23
21
|
is_internal_error,
|
|
24
22
|
)
|
|
23
|
+
from unstructured_ingest.logger import logger
|
|
24
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
25
25
|
|
|
26
26
|
if TYPE_CHECKING:
|
|
27
27
|
from botocore.client import BaseClient
|