unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. (The original page links to more details; the link target is not preserved in this text export.)
- examples/airtable.py +44 -0
- examples/azure_cognitive_search.py +55 -0
- examples/chroma.py +54 -0
- examples/couchbase.py +55 -0
- examples/databricks_volumes_dest.py +55 -0
- examples/databricks_volumes_source.py +53 -0
- examples/delta_table.py +45 -0
- examples/discord_example.py +36 -0
- examples/elasticsearch.py +49 -0
- examples/google_drive.py +45 -0
- examples/kdbai.py +54 -0
- examples/local.py +36 -0
- examples/milvus.py +44 -0
- examples/mongodb.py +53 -0
- examples/opensearch.py +50 -0
- examples/pinecone.py +57 -0
- examples/s3.py +38 -0
- examples/salesforce.py +44 -0
- examples/sharepoint.py +47 -0
- examples/singlestore.py +49 -0
- examples/sql.py +90 -0
- examples/vectara.py +54 -0
- examples/weaviate.py +44 -0
- test/integration/chunkers/test_chunkers.py +1 -1
- test/integration/connectors/conftest.py +1 -1
- test/integration/connectors/databricks/test_volumes_native.py +3 -3
- test/integration/connectors/discord/test_discord.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +2 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
- test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
- test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
- test/integration/connectors/sql/test_postgres.py +2 -2
- test/integration/connectors/sql/test_singlestore.py +2 -2
- test/integration/connectors/sql/test_snowflake.py +2 -2
- test/integration/connectors/sql/test_sqlite.py +2 -2
- test/integration/connectors/sql/test_vastdb.py +1 -1
- test/integration/connectors/test_astradb.py +2 -2
- test/integration/connectors/test_azure_ai_search.py +2 -2
- test/integration/connectors/test_chroma.py +2 -2
- test/integration/connectors/test_confluence.py +1 -1
- test/integration/connectors/test_delta_table.py +2 -2
- test/integration/connectors/test_dropbox.py +2 -2
- test/integration/connectors/test_github.py +1 -1
- test/integration/connectors/test_google_drive.py +2 -2
- test/integration/connectors/test_jira.py +1 -1
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +2 -2
- test/integration/connectors/test_mongodb.py +2 -2
- test/integration/connectors/test_neo4j.py +7 -7
- test/integration/connectors/test_notion.py +2 -2
- test/integration/connectors/test_onedrive.py +2 -2
- test/integration/connectors/test_pinecone.py +3 -3
- test/integration/connectors/test_qdrant.py +6 -6
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +3 -3
- test/integration/connectors/test_sharepoint.py +1 -1
- test/integration/connectors/test_vectara.py +4 -4
- test/integration/connectors/test_zendesk.py +2 -2
- test/integration/connectors/utils/validation/destination.py +2 -2
- test/integration/connectors/utils/validation/source.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +1 -1
- test/integration/connectors/weaviate/test_local.py +2 -2
- test/integration/embedders/test_azure_openai.py +1 -1
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -1
- test/integration/embedders/test_mixedbread.py +1 -1
- test/integration/embedders/test_octoai.py +2 -2
- test/integration/embedders/test_openai.py +2 -2
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +1 -1
- test/integration/embedders/test_voyageai.py +1 -1
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
- test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
- test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
- test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
- test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
- test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
- test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
- test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
- test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
- test/unit/test_html.py +1 -1
- test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
- test/unit/test_utils.py +106 -97
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/__init__.py +0 -14
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +259 -9
- unstructured_ingest/cli/base/dest.py +58 -61
- unstructured_ingest/cli/base/src.py +54 -36
- unstructured_ingest/cli/cli.py +4 -17
- unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
- unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
- unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
- unstructured_ingest/embed/bedrock.py +3 -3
- unstructured_ingest/embed/octoai.py +3 -3
- unstructured_ingest/embed/openai.py +3 -3
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +4 -4
- unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
- unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
- unstructured_ingest/{v2/otel.py → otel.py} +1 -1
- unstructured_ingest/pipeline/__init__.py +0 -22
- unstructured_ingest/pipeline/interfaces.py +179 -238
- unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
- unstructured_ingest/pipeline/pipeline.py +388 -97
- unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
- unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
- unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
- unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
- unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
- unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
- unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
- unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
- unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
- unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
- unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
- unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
- unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
- unstructured_ingest/utils/compression.py +1 -48
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/utils/html.py +3 -3
- unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
- unstructured_ingest/utils/string_and_date_utils.py +1 -1
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +21 -21
- unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
- test/unit/v2/test_utils.py +0 -82
- unstructured_ingest/cli/cmd_factory.py +0 -12
- unstructured_ingest/cli/cmds/__init__.py +0 -145
- unstructured_ingest/cli/cmds/airtable.py +0 -69
- unstructured_ingest/cli/cmds/astradb.py +0 -99
- unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
- unstructured_ingest/cli/cmds/biomed.py +0 -52
- unstructured_ingest/cli/cmds/chroma.py +0 -104
- unstructured_ingest/cli/cmds/clarifai.py +0 -71
- unstructured_ingest/cli/cmds/confluence.py +0 -69
- unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
- unstructured_ingest/cli/cmds/delta_table.py +0 -94
- unstructured_ingest/cli/cmds/discord.py +0 -47
- unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
- unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
- unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
- unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
- unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
- unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
- unstructured_ingest/cli/cmds/github.py +0 -54
- unstructured_ingest/cli/cmds/gitlab.py +0 -54
- unstructured_ingest/cli/cmds/google_drive.py +0 -49
- unstructured_ingest/cli/cmds/hubspot.py +0 -70
- unstructured_ingest/cli/cmds/jira.py +0 -71
- unstructured_ingest/cli/cmds/kafka.py +0 -102
- unstructured_ingest/cli/cmds/local.py +0 -43
- unstructured_ingest/cli/cmds/mongodb.py +0 -72
- unstructured_ingest/cli/cmds/notion.py +0 -48
- unstructured_ingest/cli/cmds/onedrive.py +0 -66
- unstructured_ingest/cli/cmds/opensearch.py +0 -117
- unstructured_ingest/cli/cmds/outlook.py +0 -67
- unstructured_ingest/cli/cmds/pinecone.py +0 -71
- unstructured_ingest/cli/cmds/qdrant.py +0 -124
- unstructured_ingest/cli/cmds/reddit.py +0 -67
- unstructured_ingest/cli/cmds/salesforce.py +0 -58
- unstructured_ingest/cli/cmds/sharepoint.py +0 -66
- unstructured_ingest/cli/cmds/slack.py +0 -56
- unstructured_ingest/cli/cmds/sql.py +0 -66
- unstructured_ingest/cli/cmds/vectara.py +0 -66
- unstructured_ingest/cli/cmds/weaviate.py +0 -98
- unstructured_ingest/cli/cmds/wikipedia.py +0 -40
- unstructured_ingest/cli/common.py +0 -7
- unstructured_ingest/cli/interfaces.py +0 -663
- unstructured_ingest/cli/utils.py +0 -205
- unstructured_ingest/connector/airtable.py +0 -309
- unstructured_ingest/connector/astradb.py +0 -267
- unstructured_ingest/connector/azure_ai_search.py +0 -144
- unstructured_ingest/connector/biomed.py +0 -320
- unstructured_ingest/connector/chroma.py +0 -158
- unstructured_ingest/connector/clarifai.py +0 -122
- unstructured_ingest/connector/confluence.py +0 -285
- unstructured_ingest/connector/databricks_volumes.py +0 -137
- unstructured_ingest/connector/delta_table.py +0 -203
- unstructured_ingest/connector/discord.py +0 -180
- unstructured_ingest/connector/elasticsearch.py +0 -396
- unstructured_ingest/connector/fsspec/azure.py +0 -78
- unstructured_ingest/connector/fsspec/box.py +0 -109
- unstructured_ingest/connector/fsspec/dropbox.py +0 -160
- unstructured_ingest/connector/fsspec/fsspec.py +0 -359
- unstructured_ingest/connector/fsspec/gcs.py +0 -82
- unstructured_ingest/connector/fsspec/s3.py +0 -62
- unstructured_ingest/connector/fsspec/sftp.py +0 -81
- unstructured_ingest/connector/git.py +0 -124
- unstructured_ingest/connector/github.py +0 -174
- unstructured_ingest/connector/gitlab.py +0 -142
- unstructured_ingest/connector/google_drive.py +0 -348
- unstructured_ingest/connector/hubspot.py +0 -278
- unstructured_ingest/connector/jira.py +0 -469
- unstructured_ingest/connector/kafka.py +0 -293
- unstructured_ingest/connector/local.py +0 -139
- unstructured_ingest/connector/mongodb.py +0 -284
- unstructured_ingest/connector/notion/client.py +0 -248
- unstructured_ingest/connector/notion/connector.py +0 -469
- unstructured_ingest/connector/notion/helpers.py +0 -584
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
- unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
- unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
- unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
- unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
- unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
- unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
- unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
- unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
- unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
- unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
- unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
- unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
- unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
- unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
- unstructured_ingest/connector/notion/types/date.py +0 -26
- unstructured_ingest/connector/notion/types/file.py +0 -51
- unstructured_ingest/connector/notion/types/user.py +0 -76
- unstructured_ingest/connector/onedrive.py +0 -232
- unstructured_ingest/connector/opensearch.py +0 -218
- unstructured_ingest/connector/outlook.py +0 -285
- unstructured_ingest/connector/pinecone.py +0 -150
- unstructured_ingest/connector/qdrant.py +0 -144
- unstructured_ingest/connector/reddit.py +0 -166
- unstructured_ingest/connector/registry.py +0 -109
- unstructured_ingest/connector/salesforce.py +0 -301
- unstructured_ingest/connector/sharepoint.py +0 -573
- unstructured_ingest/connector/slack.py +0 -224
- unstructured_ingest/connector/sql.py +0 -199
- unstructured_ingest/connector/vectara.py +0 -253
- unstructured_ingest/connector/weaviate.py +0 -190
- unstructured_ingest/connector/wikipedia.py +0 -208
- unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
- unstructured_ingest/enhanced_dataclass/core.py +0 -99
- unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
- unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
- unstructured_ingest/interfaces.py +0 -852
- unstructured_ingest/pipeline/copy.py +0 -19
- unstructured_ingest/pipeline/doc_factory.py +0 -12
- unstructured_ingest/pipeline/partition.py +0 -60
- unstructured_ingest/pipeline/permissions.py +0 -12
- unstructured_ingest/pipeline/reformat/chunking.py +0 -134
- unstructured_ingest/pipeline/reformat/embedding.py +0 -64
- unstructured_ingest/pipeline/source.py +0 -77
- unstructured_ingest/pipeline/utils.py +0 -6
- unstructured_ingest/pipeline/write.py +0 -18
- unstructured_ingest/processor.py +0 -93
- unstructured_ingest/runner/__init__.py +0 -104
- unstructured_ingest/runner/airtable.py +0 -35
- unstructured_ingest/runner/astradb.py +0 -34
- unstructured_ingest/runner/base_runner.py +0 -89
- unstructured_ingest/runner/biomed.py +0 -45
- unstructured_ingest/runner/confluence.py +0 -35
- unstructured_ingest/runner/delta_table.py +0 -34
- unstructured_ingest/runner/discord.py +0 -35
- unstructured_ingest/runner/elasticsearch.py +0 -40
- unstructured_ingest/runner/fsspec/azure.py +0 -30
- unstructured_ingest/runner/fsspec/box.py +0 -28
- unstructured_ingest/runner/fsspec/dropbox.py +0 -30
- unstructured_ingest/runner/fsspec/fsspec.py +0 -40
- unstructured_ingest/runner/fsspec/gcs.py +0 -28
- unstructured_ingest/runner/fsspec/s3.py +0 -28
- unstructured_ingest/runner/fsspec/sftp.py +0 -28
- unstructured_ingest/runner/github.py +0 -37
- unstructured_ingest/runner/gitlab.py +0 -37
- unstructured_ingest/runner/google_drive.py +0 -35
- unstructured_ingest/runner/hubspot.py +0 -35
- unstructured_ingest/runner/jira.py +0 -35
- unstructured_ingest/runner/kafka.py +0 -34
- unstructured_ingest/runner/local.py +0 -23
- unstructured_ingest/runner/mongodb.py +0 -34
- unstructured_ingest/runner/notion.py +0 -61
- unstructured_ingest/runner/onedrive.py +0 -35
- unstructured_ingest/runner/opensearch.py +0 -40
- unstructured_ingest/runner/outlook.py +0 -33
- unstructured_ingest/runner/reddit.py +0 -35
- unstructured_ingest/runner/salesforce.py +0 -33
- unstructured_ingest/runner/sharepoint.py +0 -35
- unstructured_ingest/runner/slack.py +0 -33
- unstructured_ingest/runner/utils.py +0 -47
- unstructured_ingest/runner/wikipedia.py +0 -35
- unstructured_ingest/runner/writers/__init__.py +0 -48
- unstructured_ingest/runner/writers/astradb.py +0 -22
- unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
- unstructured_ingest/runner/writers/base_writer.py +0 -26
- unstructured_ingest/runner/writers/chroma.py +0 -22
- unstructured_ingest/runner/writers/clarifai.py +0 -19
- unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
- unstructured_ingest/runner/writers/delta_table.py +0 -24
- unstructured_ingest/runner/writers/elasticsearch.py +0 -24
- unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
- unstructured_ingest/runner/writers/fsspec/box.py +0 -21
- unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
- unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
- unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
- unstructured_ingest/runner/writers/kafka.py +0 -21
- unstructured_ingest/runner/writers/mongodb.py +0 -21
- unstructured_ingest/runner/writers/opensearch.py +0 -26
- unstructured_ingest/runner/writers/pinecone.py +0 -21
- unstructured_ingest/runner/writers/qdrant.py +0 -19
- unstructured_ingest/runner/writers/sql.py +0 -22
- unstructured_ingest/runner/writers/vectara.py +0 -22
- unstructured_ingest/runner/writers/weaviate.py +0 -21
- unstructured_ingest/utils/google_filetype.py +0 -9
- unstructured_ingest/v2/__init__.py +0 -1
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +0 -4
- unstructured_ingest/v2/cli/base/cmd.py +0 -269
- unstructured_ingest/v2/cli/base/dest.py +0 -85
- unstructured_ingest/v2/cli/base/src.py +0 -85
- unstructured_ingest/v2/cli/cli.py +0 -24
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/logger.py +0 -126
- unstructured_ingest/v2/main.py +0 -11
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +0 -211
- unstructured_ingest/v2/pipeline/pipeline.py +0 -408
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
- {test/unit/v2 → examples}/__init__.py +0 -0
- /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
- /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/data_generator.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
- /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
- /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
- /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
- /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
- /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
- /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
- /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
- /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
- /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
- /unstructured_ingest/{v2 → utils}/constants.py +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
5
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
6
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
7
|
-
|
|
8
|
-
if t.TYPE_CHECKING:
|
|
9
|
-
from unstructured_ingest.connector.databricks_volumes import (
|
|
10
|
-
DatabricksVolumesWriteConfig,
|
|
11
|
-
SimpleDatabricksVolumesConfig,
|
|
12
|
-
)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@dataclass
class DatabricksVolumesWriter(Writer, EnhancedDataClassJsonMixin):
    """Writer holding the Databricks Volumes write and connector configuration.

    Combines the ``Writer`` base with ``EnhancedDataClassJsonMixin`` and
    resolves the matching destination connector class on request.
    """

    # Field order defines the generated ``__init__`` signature; keep it stable.
    write_config: "DatabricksVolumesWriteConfig"
    connector_config: "SimpleDatabricksVolumesConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Databricks Volumes destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.databricks_volumes import (
            DatabricksVolumesDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.delta_table import (
|
|
9
|
-
DeltaTableWriteConfig,
|
|
10
|
-
SimpleDeltaTableConfig,
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@dataclass
class DeltaTableWriter(Writer):
    """Writer holding the Delta Table write and connector configuration."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    write_config: "DeltaTableWriteConfig"
    connector_config: "SimpleDeltaTableConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Delta Table destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.delta_table import (
            DeltaTableDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.elasticsearch import (
|
|
9
|
-
ElasticsearchWriteConfig,
|
|
10
|
-
SimpleElasticsearchConfig,
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@dataclass
class ElasticsearchWriter(Writer):
    """Writer holding the Elasticsearch connector and write configuration."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    connector_config: "SimpleElasticsearchConfig"
    write_config: "ElasticsearchWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Elasticsearch destination connector class.

        The method returns the class itself rather than an instance, so the
        return annotation is ``t.Type[BaseDestinationConnector]``, matching
        the other writers.

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.elasticsearch import (
            ElasticsearchDestinationConnector,
        )

        return ElasticsearchDestinationConnector
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.fsspec.azure import (
|
|
9
|
-
AzureWriteConfig,
|
|
10
|
-
SimpleAzureBlobStorageConfig,
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@dataclass
class AzureWriter(Writer):
    """Writer holding the Azure Blob Storage connector and write configuration."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    connector_config: "SimpleAzureBlobStorageConfig"
    write_config: "AzureWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Azure Blob Storage destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.fsspec.azure import (
            AzureBlobStorageDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.fsspec.box import BoxWriteConfig, SimpleBoxConfig
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
class BoxWriter(Writer):
    """Writer holding the Box connector and write configuration."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    connector_config: "SimpleBoxConfig"
    write_config: "BoxWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Box destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.fsspec.box import (
            BoxDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.fsspec.dropbox import DropboxWriteConfig, SimpleDropboxConfig
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
class DropboxWriter(Writer):
    """Writer holding the Dropbox connector and write configuration."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    connector_config: "SimpleDropboxConfig"
    write_config: "DropboxWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Dropbox destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.fsspec.dropbox import (
            DropboxDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.fsspec.gcs import GcsWriteConfig, SimpleGcsConfig
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
class GcsWriter(Writer):
    """Writer holding the GCS connector and write configuration."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    connector_config: "SimpleGcsConfig"
    write_config: "GcsWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the GCS destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.fsspec.gcs import (
            GcsDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.fsspec.s3 import S3WriteConfig, SimpleS3Config
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
class S3Writer(Writer):
    """Writer holding the S3 connector and write configuration."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    connector_config: "SimpleS3Config"
    write_config: "S3WriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the S3 destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.fsspec.s3 import (
            S3DestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.kafka import KafkaWriteConfig, SimpleKafkaConfig
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
class KafkaWriter(Writer):
    """Writer holding the Kafka write and connector configuration."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    write_config: "KafkaWriteConfig"
    connector_config: "SimpleKafkaConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Kafka destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.kafka import (
            KafkaDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.mongodb import MongoDBWriteConfig, SimpleMongoDBConfig
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
class MongodbWriter(Writer):
    """Writer holding the MongoDB write and connector configuration."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    write_config: "MongoDBWriteConfig"
    connector_config: "SimpleMongoDBConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the MongoDB destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.mongodb import (
            MongoDBDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.elasticsearch import (
|
|
9
|
-
ElasticsearchWriteConfig,
|
|
10
|
-
)
|
|
11
|
-
from unstructured_ingest.connector.opensearch import (
|
|
12
|
-
SimpleOpenSearchConfig,
|
|
13
|
-
)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
@dataclass
class OpenSearchWriter(Writer):
    """Writer holding the OpenSearch connector configuration.

    The write configuration type is ``ElasticsearchWriteConfig``, taken from
    the Elasticsearch connector module.
    """

    # Field order defines the generated ``__init__`` signature; keep it stable.
    connector_config: "SimpleOpenSearchConfig"
    write_config: "ElasticsearchWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the OpenSearch destination connector class.

        The method returns the class itself rather than an instance, so the
        return annotation is ``t.Type[BaseDestinationConnector]``, matching
        the other writers.

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.opensearch import (
            OpenSearchDestinationConnector,
        )

        return OpenSearchDestinationConnector
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.pinecone import PineconeWriteConfig, SimplePineconeConfig
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
class PineconeWriter(Writer):
    """Writer holding the Pinecone write and connector configuration."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    write_config: "PineconeWriteConfig"
    connector_config: "SimplePineconeConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Pinecone destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.pinecone import (
            PineconeDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.qdrant import QdrantWriteConfig, SimpleQdrantConfig
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
class QdrantWriter(Writer):
    """Writer holding the Qdrant write and connector configuration."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    write_config: "QdrantWriteConfig"
    connector_config: "SimpleQdrantConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Qdrant destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.qdrant import (
            QdrantDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.sql import SimpleSqlConfig
|
|
9
|
-
from unstructured_ingest.interfaces import WriteConfig
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@dataclass
class SqlWriter(Writer):
    """Writer holding the SQL connector configuration and a generic ``WriteConfig``."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    write_config: "WriteConfig"
    connector_config: "SimpleSqlConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the SQL destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.sql import (
            SqlDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
5
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
6
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
7
|
-
|
|
8
|
-
if t.TYPE_CHECKING:
|
|
9
|
-
from unstructured_ingest.connector.vectara import SimpleVectaraConfig, VectaraWriteConfig
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@dataclass
class VectaraWriter(Writer, EnhancedDataClassJsonMixin):
    """Writer holding the Vectara write and connector configuration.

    Combines the ``Writer`` base with ``EnhancedDataClassJsonMixin`` and
    resolves the matching destination connector class on request.
    """

    # Field order defines the generated ``__init__`` signature; keep it stable.
    write_config: "VectaraWriteConfig"
    connector_config: "SimpleVectaraConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Vectara destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.vectara import (
            VectaraDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
-
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
-
|
|
7
|
-
if t.TYPE_CHECKING:
|
|
8
|
-
from unstructured_ingest.connector.weaviate import SimpleWeaviateConfig, WeaviateWriteConfig
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
class WeaviateWriter(Writer):
    """Writer holding the Weaviate write and connector configuration."""

    # Field order defines the generated ``__init__`` signature; keep it stable.
    write_config: "WeaviateWriteConfig"
    connector_config: "SimpleWeaviateConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the Weaviate destination connector class (not an instance).

        The connector module is imported at call time; at module level its
        names are only imported under ``t.TYPE_CHECKING``.
        """
        from unstructured_ingest.connector.weaviate import (
            WeaviateDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
# Maps Google Workspace MIME types (keys) to another MIME type (values):
# Office Open XML formats for documents, spreadsheets and presentations, and
# JPEG for photos. Going by the constant's name, the value is the type used
# when exporting the file from Google Drive.
GOOGLE_DRIVE_EXPORT_TYPES = {
    "application/vnd.google-apps.document": (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ),
    "application/vnd.google-apps.spreadsheet": (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ),
    "application/vnd.google-apps.presentation": (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    ),
    "application/vnd.google-apps.photo": "image/jpeg",
}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
File without changes
|
|
@@ -1,269 +0,0 @@
|
|
|
1
|
-
import inspect
|
|
2
|
-
from abc import ABC, abstractmethod
|
|
3
|
-
from collections import Counter
|
|
4
|
-
from dataclasses import dataclass, field, fields
|
|
5
|
-
from typing import Any, Optional, Type, TypeVar
|
|
6
|
-
|
|
7
|
-
import click
|
|
8
|
-
from pydantic import BaseModel
|
|
9
|
-
|
|
10
|
-
from unstructured_ingest.v2.cli.base.importer import import_from_string
|
|
11
|
-
from unstructured_ingest.v2.cli.utils.click import extract_config
|
|
12
|
-
from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model, post_check
|
|
13
|
-
from unstructured_ingest.v2.interfaces import ProcessorConfig
|
|
14
|
-
from unstructured_ingest.v2.logger import logger
|
|
15
|
-
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
|
|
16
|
-
from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
|
|
17
|
-
from unstructured_ingest.v2.processes.connector_registry import (
|
|
18
|
-
DownloaderT,
|
|
19
|
-
IndexerT,
|
|
20
|
-
RegistryEntry,
|
|
21
|
-
UploaderT,
|
|
22
|
-
UploadStager,
|
|
23
|
-
UploadStagerConfig,
|
|
24
|
-
UploadStagerT,
|
|
25
|
-
destination_registry,
|
|
26
|
-
source_registry,
|
|
27
|
-
)
|
|
28
|
-
from unstructured_ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig
|
|
29
|
-
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
30
|
-
from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig
|
|
31
|
-
from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
|
|
32
|
-
|
|
33
|
-
# Type variable bound to click.Command, so a function annotated with it
# returns the same command type it was given.
CommandT = TypeVar("CommandT", bound=click.Command)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
@dataclass
class BaseCmd(ABC):
    """Abstract base for CLI commands that assemble an ingest ``Pipeline``.

    Attributes:
        cmd_name: Name of the command; ``cmd_name_key`` and ``cli_cmd_name``
            derive the underscore and dash spellings from it.
        registry_entry: Registry entry this command is built around.
        default_configs: Extra pydantic models whose fields are exposed as
            click options in addition to the registry-derived ones.
    """

    cmd_name: str
    registry_entry: RegistryEntry
    default_configs: list[Type[BaseModel]] = field(default_factory=list)

    @abstractmethod
    def get_registry_options(self):
        """Return the click options for ``registry_entry``.

        ``add_options`` extends the returned value in place, so
        implementations must return a mutable list.
        """

    def get_default_options(self) -> list[click.Option]:
        """Return the click options generated from every model in ``default_configs``."""
        options = []
        for extra in self.default_configs:
            options.extend(options_from_base_model(model=extra))
        return options

    @classmethod
    def consolidate_options(cls, options: list[click.Option]) -> list[click.Option]:
        """Drop options that are exact duplicates of an earlier option with the same name.

        Args:
            options: Options to consolidate, possibly containing repeated names.

        Returns:
            ``options`` itself when no name is repeated; otherwise a new list
            holding the first occurrence of each name, in original order.

        Raises:
            ValueError: If two options share a name but differ in any attribute.
        """
        option_names = [option.name for option in options]
        if not any(count > 1 for count in Counter(option_names).values()):
            return options
        # name -> first option seen with that name; dicts keep insertion order,
        # so the values come back in the order the options were first seen.
        first_seen: dict[Optional[str], click.Option] = {}
        for option in options:
            existing_option = first_seen.get(option.name)
            if existing_option is None:
                first_seen[option.name] = option
                continue
            if existing_option.__dict__ == option.__dict__:
                continue
            option_diff = cls.get_options_diff(o1=option, o2=existing_option)
            raise ValueError(
                "Conflicting duplicate {} option defined: {}".format(
                    option.name, " | ".join([f"{d[0]}: {d[1]}" for d in option_diff])
                )
            )
        return list(first_seen.values())

    @staticmethod
    def get_options_diff(o1: click.Option, o2: click.Option):
        """Return the attribute differences between two options.

        The result is the symmetric difference of the ``(attribute, value)``
        pairs of both options, as a set of 2-tuples.

        The comparison works on copies of each option's ``__dict__``, so the
        options themselves are never modified. ``opts`` and ``secondary_opts``
        are joined into comma-separated strings, and any other unhashable
        value is replaced by its ``repr`` so it can be stored in a set.
        """

        def _comparable_items(option: click.Option) -> set:
            # Copy first: writing into option.__dict__ would overwrite the
            # option's real ``opts``/``secondary_opts`` lists with strings.
            snapshot = dict(option.__dict__)
            snapshot["opts"] = ",".join(snapshot["opts"])
            snapshot["secondary_opts"] = ",".join(snapshot["secondary_opts"])
            items = set()
            for key, value in snapshot.items():
                try:
                    hash(value)
                except TypeError:
                    value = repr(value)
                items.add((key, value))
            return items

        return _comparable_items(o1) ^ _comparable_items(o2)

    @property
    def cmd_name_key(self) -> str:
        """``cmd_name`` with dashes replaced by underscores."""
        return self.cmd_name.replace("-", "_")

    @property
    def cli_cmd_name(self) -> str:
        """``cmd_name`` with underscores replaced by dashes."""
        return self.cmd_name.replace("_", "-")

    @abstractmethod
    def cmd(self, ctx: click.Context, **options) -> None:
        """Run the command for the given click context and parsed options."""

    def add_options(self, cmd: CommandT) -> CommandT:
        """Attach the registry and default options to ``cmd`` and return it.

        The combined options are passed through ``post_check`` before being
        appended to ``cmd.params``.
        """
        options = self.get_registry_options()
        options.extend(self.get_default_options())
        post_check(options)
        cmd.params.extend(options)
        return cmd

    def get_pipeline(
        self,
        src: str,
        source_options: dict[str, Any],
        dest: Optional[str] = None,
        destination_options: Optional[dict[str, Any]] = None,
    ) -> Pipeline:
        """Build a ``Pipeline`` from flat CLI options.

        Args:
            src: Key of the source connector in ``source_registry``.
            source_options: Flat options for the source and for the processor,
                partitioner, chunker, filterer and embedder configs.
            dest: Key of the destination connector in ``destination_registry``.
                When omitted, a local uploader is used.
            destination_options: Flat options for the destination. ``None`` is
                treated as an empty dict.

        Returns:
            The configured pipeline.

        Raises:
            KeyError: If no destination is given and neither option dict
                contains ``output_dir``.
        """
        logger.debug(
            f"creating pipeline from cli using source {src} with options: {source_options}"
        )
        # Normalise once so both branches can read the options; previously a
        # destination with no options crashed on ``None.get(...)``.
        destination_options = destination_options or {}
        pipeline_kwargs: dict[str, Any] = {
            "context": self.get_processor_config(options=source_options),
            "downloader": self.get_downloader(src=src, options=source_options),
            "indexer": self.get_indexer(src=src, options=source_options),
            "partitioner": self.get_partitioner(options=source_options),
        }
        # Optional steps are only added when their getter returns something.
        if chunker := self.get_chunker(options=source_options):
            pipeline_kwargs["chunker"] = chunker
        if filterer := self.get_filterer(options=source_options):
            pipeline_kwargs["filterer"] = filterer
        if embedder := self.get_embedder(options=source_options):
            pipeline_kwargs["embedder"] = embedder
        if dest:
            logger.debug(
                f"setting destination on pipeline {dest} with options: {destination_options}"
            )
            if uploader_stager := self.get_upload_stager(dest=dest, options=destination_options):
                pipeline_kwargs["stager"] = uploader_stager
            pipeline_kwargs["uploader"] = self.get_uploader(dest=dest, options=destination_options)
        else:
            # Default to local uploader
            # TODO remove after v1 no longer supported
            if "output_dir" not in destination_options:
                destination_options["output_dir"] = source_options["output_dir"]
            pipeline_kwargs["uploader"] = self.get_default_uploader(options=destination_options)
        return Pipeline(**pipeline_kwargs)

    @staticmethod
    def get_default_uploader(options: dict[str, Any]) -> UploaderT:
        """Return a ``LocalUploader`` configured from ``options``."""
        uploader_config = extract_config(flat_data=options, config=LocalUploaderConfig)
        return LocalUploader(upload_config=uploader_config)

    @staticmethod
    def get_chunker(options: dict[str, Any]) -> Optional[Chunker]:
        """Return a ``Chunker``, or ``None`` when no chunking strategy is set."""
        chunker_config = extract_config(flat_data=options, config=ChunkerConfig)
        if not chunker_config.chunking_strategy:
            return None
        return Chunker(config=chunker_config)

    @staticmethod
    def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
        """Return a ``Filterer``, or ``None`` when the filter config dumps to an empty dict."""
        filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
        # NOTE(review): pydantic's model_dump() includes fields that still hold
        # their defaults, so this is only falsy for a model that dumps to {}.
        # Confirm this is the intended "no filter configured" check.
        if not filterer_configs.model_dump():
            return None
        return Filterer(config=filterer_configs)

    @staticmethod
    def get_embedder(options: dict[str, Any]) -> Optional[Embedder]:
        """Return an ``Embedder``, or ``None`` when no embedding provider is set."""
        embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
        if not embedder_config.embedding_provider:
            return None
        return Embedder(config=embedder_config)

    @staticmethod
    def get_partitioner(options: dict[str, Any]) -> Partitioner:
        """Return a ``Partitioner`` configured from ``options``."""
        partitioner_config = extract_config(flat_data=options, config=PartitionerConfig)
        return Partitioner(config=partitioner_config)

    @staticmethod
    def get_processor_config(options: dict[str, Any]) -> ProcessorConfig:
        """Return the ``ProcessorConfig`` extracted from ``options``."""
        return extract_config(flat_data=options, config=ProcessorConfig)

    @staticmethod
    def get_indexer(src: str, options: dict[str, Any]) -> IndexerT:
        """Instantiate the indexer registered for ``src``.

        The index and connection configs are only passed when the registry
        entry declares the corresponding config class.

        Raises:
            KeyError: If ``src`` is not in ``source_registry``.
        """
        source_entry = source_registry[src]
        indexer_kwargs: dict[str, Any] = {}
        if indexer_config_cls := source_entry.indexer_config:
            indexer_kwargs["index_config"] = extract_config(
                flat_data=options, config=indexer_config_cls
            )
        if connection_config_cls := source_entry.connection_config:
            indexer_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        indexer_cls = source_entry.indexer
        return indexer_cls(**indexer_kwargs)

    @staticmethod
    def get_downloader(src: str, options: dict[str, Any]) -> DownloaderT:
        """Instantiate the downloader registered for ``src``.

        The download and connection configs are only passed when the registry
        entry declares the corresponding config class.

        Raises:
            KeyError: If ``src`` is not in ``source_registry``.
        """
        source_entry = source_registry[src]
        downloader_kwargs: dict[str, Any] = {}
        if downloader_config_cls := source_entry.downloader_config:
            downloader_kwargs["download_config"] = extract_config(
                flat_data=options, config=downloader_config_cls
            )
        if connection_config_cls := source_entry.connection_config:
            downloader_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        downloader_cls = source_entry.downloader
        return downloader_cls(**downloader_kwargs)

    @staticmethod
    def get_custom_stager(
        stager_reference: str, stager_config_kwargs: Optional[dict] = None
    ) -> Optional[UploadStagerT]:
        """Import and instantiate a user-supplied upload stager.

        Args:
            stager_reference: Import string resolved with ``import_from_string``.
            stager_config_kwargs: Keyword arguments for the stager's config
                class. When empty or ``None``, the stager is created without
                an explicit config.

        Returns:
            An instance of the referenced stager class.

        Raises:
            ValueError: If the reference is not a class implementing
                ``UploadStager``, or if the type of its
                ``upload_stager_config`` field is not a class implementing
                ``UploadStagerConfig``.
            KeyError: If the stager has no ``upload_stager_config`` field.
        """
        stager_cls = import_from_string(stager_reference)
        if not inspect.isclass(stager_cls):
            raise ValueError(
                f"custom stager must be a reference to a python class, got: {type(stager_cls)}"
            )
        if not issubclass(stager_cls, UploadStager):
            raise ValueError(
                "custom stager must be an implementation of the UploadStager interface"
            )
        # The config class is read from the dataclass field annotation.
        fields_dict = {f.name: f.type for f in fields(stager_cls)}
        upload_stager_config_cls = fields_dict["upload_stager_config"]
        if not inspect.isclass(upload_stager_config_cls):
            raise ValueError(
                f"custom stager config must be a class, got: {type(upload_stager_config_cls)}"
            )
        if not issubclass(upload_stager_config_cls, UploadStagerConfig):
            raise ValueError(
                "custom stager config must be an implementation "
                "of the UploadStagerConfig interface"
            )
        upload_stager_kwargs: dict[str, Any] = {}
        if stager_config_kwargs:
            upload_stager_kwargs["upload_stager_config"] = upload_stager_config_cls(
                **stager_config_kwargs
            )
        return stager_cls(**upload_stager_kwargs)

    @staticmethod
    def get_upload_stager(dest: str, options: dict[str, Any]) -> Optional[UploadStagerT]:
        """Return the upload stager for ``dest``, or ``None`` if it has none.

        A ``custom_stager`` entry in ``options`` takes precedence over the
        registry: the referenced class is built with
        ``custom_stager_config_kwargs`` and the registry is not consulted.

        Raises:
            KeyError: If no custom stager is given and ``dest`` is not in
                ``destination_registry``.
        """
        if custom_stager := options.get("custom_stager"):
            return BaseCmd.get_custom_stager(
                stager_reference=custom_stager,
                stager_config_kwargs=options.get("custom_stager_config_kwargs"),
            )
        dest_entry = destination_registry[dest]
        upload_stager_kwargs: dict[str, Any] = {}
        if upload_stager_config_cls := dest_entry.upload_stager_config:
            upload_stager_kwargs["upload_stager_config"] = extract_config(
                flat_data=options, config=upload_stager_config_cls
            )
        if upload_stager_cls := dest_entry.upload_stager:
            return upload_stager_cls(**upload_stager_kwargs)
        return None

    @staticmethod
    def get_uploader(dest: str, options: dict[str, Any]) -> UploaderT:
        """Instantiate the uploader registered for ``dest``.

        The upload and connection configs are only passed when the registry
        entry declares the corresponding config class.

        Raises:
            KeyError: If ``dest`` is not in ``destination_registry``.
        """
        dest_entry = destination_registry[dest]
        uploader_kwargs: dict[str, Any] = {}
        if uploader_config_cls := dest_entry.uploader_config:
            uploader_kwargs["upload_config"] = extract_config(
                flat_data=options, config=uploader_config_cls
            )
        if connection_config_cls := dest_entry.connection_config:
            uploader_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        uploader_cls = dest_entry.uploader
        return uploader_cls(**uploader_kwargs)
|