unstructured-ingest 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- examples/airtable.py +44 -0
- examples/azure_cognitive_search.py +55 -0
- examples/chroma.py +54 -0
- examples/couchbase.py +55 -0
- examples/databricks_volumes_dest.py +55 -0
- examples/databricks_volumes_source.py +53 -0
- examples/delta_table.py +45 -0
- examples/discord_example.py +36 -0
- examples/elasticsearch.py +49 -0
- examples/google_drive.py +45 -0
- examples/kdbai.py +54 -0
- examples/local.py +36 -0
- examples/milvus.py +44 -0
- examples/mongodb.py +53 -0
- examples/opensearch.py +50 -0
- examples/pinecone.py +57 -0
- examples/s3.py +38 -0
- examples/salesforce.py +44 -0
- examples/sharepoint.py +47 -0
- examples/singlestore.py +49 -0
- examples/sql.py +90 -0
- examples/vectara.py +54 -0
- examples/weaviate.py +44 -0
- test/integration/chunkers/test_chunkers.py +1 -1
- test/integration/connectors/conftest.py +1 -1
- test/integration/connectors/databricks/test_volumes_native.py +3 -3
- test/integration/connectors/discord/test_discord.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +2 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
- test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
- test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
- test/integration/connectors/sql/test_postgres.py +2 -2
- test/integration/connectors/sql/test_singlestore.py +2 -2
- test/integration/connectors/sql/test_snowflake.py +2 -2
- test/integration/connectors/sql/test_sqlite.py +2 -2
- test/integration/connectors/sql/test_vastdb.py +1 -1
- test/integration/connectors/test_astradb.py +2 -2
- test/integration/connectors/test_azure_ai_search.py +2 -2
- test/integration/connectors/test_chroma.py +2 -2
- test/integration/connectors/test_confluence.py +1 -1
- test/integration/connectors/test_delta_table.py +2 -2
- test/integration/connectors/test_dropbox.py +2 -2
- test/integration/connectors/test_github.py +49 -0
- test/integration/connectors/test_google_drive.py +2 -2
- test/integration/connectors/test_jira.py +1 -1
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +2 -2
- test/integration/connectors/test_mongodb.py +2 -2
- test/integration/connectors/test_neo4j.py +7 -7
- test/integration/connectors/test_notion.py +2 -2
- test/integration/connectors/test_onedrive.py +2 -2
- test/integration/connectors/test_pinecone.py +3 -3
- test/integration/connectors/test_qdrant.py +6 -6
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +3 -3
- test/integration/connectors/test_sharepoint.py +1 -1
- test/integration/connectors/test_vectara.py +4 -4
- test/integration/connectors/test_zendesk.py +2 -2
- test/integration/connectors/utils/validation/destination.py +2 -2
- test/integration/connectors/utils/validation/source.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +1 -1
- test/integration/connectors/weaviate/test_local.py +2 -2
- test/integration/embedders/test_azure_openai.py +1 -1
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -1
- test/integration/embedders/test_mixedbread.py +1 -1
- test/integration/embedders/test_octoai.py +2 -2
- test/integration/embedders/test_openai.py +2 -2
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +1 -1
- test/integration/embedders/test_voyageai.py +1 -1
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
- test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
- test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
- test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
- test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
- test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
- test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
- test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
- test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
- test/unit/test_html.py +1 -1
- test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
- test/unit/test_utils.py +106 -97
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/__init__.py +0 -14
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +259 -9
- unstructured_ingest/cli/base/dest.py +58 -61
- unstructured_ingest/cli/base/src.py +54 -36
- unstructured_ingest/cli/cli.py +4 -17
- unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
- unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
- unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
- unstructured_ingest/embed/bedrock.py +3 -3
- unstructured_ingest/embed/octoai.py +3 -3
- unstructured_ingest/embed/openai.py +3 -3
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +4 -4
- unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
- unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
- unstructured_ingest/{v2/otel.py → otel.py} +1 -1
- unstructured_ingest/pipeline/__init__.py +0 -22
- unstructured_ingest/pipeline/interfaces.py +179 -238
- unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
- unstructured_ingest/pipeline/pipeline.py +388 -97
- unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
- unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
- unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
- unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/processes/connectors/github.py +221 -0
- unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
- unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
- unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
- unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
- unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
- unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
- unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
- unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
- unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
- unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
- unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
- unstructured_ingest/utils/compression.py +1 -48
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/utils/html.py +3 -3
- unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
- unstructured_ingest/utils/string_and_date_utils.py +1 -1
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
- unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
- test/unit/v2/test_utils.py +0 -82
- unstructured_ingest/cli/cmd_factory.py +0 -12
- unstructured_ingest/cli/cmds/__init__.py +0 -145
- unstructured_ingest/cli/cmds/airtable.py +0 -69
- unstructured_ingest/cli/cmds/astradb.py +0 -99
- unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
- unstructured_ingest/cli/cmds/biomed.py +0 -52
- unstructured_ingest/cli/cmds/chroma.py +0 -104
- unstructured_ingest/cli/cmds/clarifai.py +0 -71
- unstructured_ingest/cli/cmds/confluence.py +0 -69
- unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
- unstructured_ingest/cli/cmds/delta_table.py +0 -94
- unstructured_ingest/cli/cmds/discord.py +0 -47
- unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
- unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
- unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
- unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
- unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
- unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
- unstructured_ingest/cli/cmds/github.py +0 -54
- unstructured_ingest/cli/cmds/gitlab.py +0 -54
- unstructured_ingest/cli/cmds/google_drive.py +0 -49
- unstructured_ingest/cli/cmds/hubspot.py +0 -70
- unstructured_ingest/cli/cmds/jira.py +0 -71
- unstructured_ingest/cli/cmds/kafka.py +0 -102
- unstructured_ingest/cli/cmds/local.py +0 -43
- unstructured_ingest/cli/cmds/mongodb.py +0 -72
- unstructured_ingest/cli/cmds/notion.py +0 -48
- unstructured_ingest/cli/cmds/onedrive.py +0 -66
- unstructured_ingest/cli/cmds/opensearch.py +0 -117
- unstructured_ingest/cli/cmds/outlook.py +0 -67
- unstructured_ingest/cli/cmds/pinecone.py +0 -71
- unstructured_ingest/cli/cmds/qdrant.py +0 -124
- unstructured_ingest/cli/cmds/reddit.py +0 -67
- unstructured_ingest/cli/cmds/salesforce.py +0 -58
- unstructured_ingest/cli/cmds/sharepoint.py +0 -66
- unstructured_ingest/cli/cmds/slack.py +0 -56
- unstructured_ingest/cli/cmds/sql.py +0 -66
- unstructured_ingest/cli/cmds/vectara.py +0 -66
- unstructured_ingest/cli/cmds/weaviate.py +0 -98
- unstructured_ingest/cli/cmds/wikipedia.py +0 -40
- unstructured_ingest/cli/common.py +0 -7
- unstructured_ingest/cli/interfaces.py +0 -663
- unstructured_ingest/cli/utils.py +0 -205
- unstructured_ingest/connector/airtable.py +0 -309
- unstructured_ingest/connector/astradb.py +0 -267
- unstructured_ingest/connector/azure_ai_search.py +0 -144
- unstructured_ingest/connector/biomed.py +0 -320
- unstructured_ingest/connector/chroma.py +0 -158
- unstructured_ingest/connector/clarifai.py +0 -122
- unstructured_ingest/connector/confluence.py +0 -285
- unstructured_ingest/connector/databricks_volumes.py +0 -137
- unstructured_ingest/connector/delta_table.py +0 -203
- unstructured_ingest/connector/discord.py +0 -180
- unstructured_ingest/connector/elasticsearch.py +0 -396
- unstructured_ingest/connector/fsspec/azure.py +0 -78
- unstructured_ingest/connector/fsspec/box.py +0 -109
- unstructured_ingest/connector/fsspec/dropbox.py +0 -160
- unstructured_ingest/connector/fsspec/fsspec.py +0 -359
- unstructured_ingest/connector/fsspec/gcs.py +0 -82
- unstructured_ingest/connector/fsspec/s3.py +0 -62
- unstructured_ingest/connector/fsspec/sftp.py +0 -81
- unstructured_ingest/connector/git.py +0 -124
- unstructured_ingest/connector/github.py +0 -174
- unstructured_ingest/connector/gitlab.py +0 -142
- unstructured_ingest/connector/google_drive.py +0 -348
- unstructured_ingest/connector/hubspot.py +0 -278
- unstructured_ingest/connector/jira.py +0 -469
- unstructured_ingest/connector/kafka.py +0 -293
- unstructured_ingest/connector/local.py +0 -139
- unstructured_ingest/connector/mongodb.py +0 -284
- unstructured_ingest/connector/notion/client.py +0 -248
- unstructured_ingest/connector/notion/connector.py +0 -469
- unstructured_ingest/connector/notion/helpers.py +0 -584
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
- unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
- unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
- unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
- unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
- unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
- unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
- unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
- unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
- unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
- unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
- unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
- unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
- unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
- unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
- unstructured_ingest/connector/notion/types/date.py +0 -26
- unstructured_ingest/connector/notion/types/file.py +0 -51
- unstructured_ingest/connector/notion/types/user.py +0 -76
- unstructured_ingest/connector/onedrive.py +0 -232
- unstructured_ingest/connector/opensearch.py +0 -218
- unstructured_ingest/connector/outlook.py +0 -285
- unstructured_ingest/connector/pinecone.py +0 -150
- unstructured_ingest/connector/qdrant.py +0 -144
- unstructured_ingest/connector/reddit.py +0 -166
- unstructured_ingest/connector/registry.py +0 -109
- unstructured_ingest/connector/salesforce.py +0 -301
- unstructured_ingest/connector/sharepoint.py +0 -573
- unstructured_ingest/connector/slack.py +0 -224
- unstructured_ingest/connector/sql.py +0 -199
- unstructured_ingest/connector/vectara.py +0 -253
- unstructured_ingest/connector/weaviate.py +0 -190
- unstructured_ingest/connector/wikipedia.py +0 -208
- unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
- unstructured_ingest/enhanced_dataclass/core.py +0 -99
- unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
- unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
- unstructured_ingest/interfaces.py +0 -852
- unstructured_ingest/pipeline/copy.py +0 -19
- unstructured_ingest/pipeline/doc_factory.py +0 -12
- unstructured_ingest/pipeline/partition.py +0 -60
- unstructured_ingest/pipeline/permissions.py +0 -12
- unstructured_ingest/pipeline/reformat/chunking.py +0 -134
- unstructured_ingest/pipeline/reformat/embedding.py +0 -64
- unstructured_ingest/pipeline/source.py +0 -77
- unstructured_ingest/pipeline/utils.py +0 -6
- unstructured_ingest/pipeline/write.py +0 -18
- unstructured_ingest/processor.py +0 -93
- unstructured_ingest/runner/__init__.py +0 -104
- unstructured_ingest/runner/airtable.py +0 -35
- unstructured_ingest/runner/astradb.py +0 -34
- unstructured_ingest/runner/base_runner.py +0 -89
- unstructured_ingest/runner/biomed.py +0 -45
- unstructured_ingest/runner/confluence.py +0 -35
- unstructured_ingest/runner/delta_table.py +0 -34
- unstructured_ingest/runner/discord.py +0 -35
- unstructured_ingest/runner/elasticsearch.py +0 -40
- unstructured_ingest/runner/fsspec/azure.py +0 -30
- unstructured_ingest/runner/fsspec/box.py +0 -28
- unstructured_ingest/runner/fsspec/dropbox.py +0 -30
- unstructured_ingest/runner/fsspec/fsspec.py +0 -40
- unstructured_ingest/runner/fsspec/gcs.py +0 -28
- unstructured_ingest/runner/fsspec/s3.py +0 -28
- unstructured_ingest/runner/fsspec/sftp.py +0 -28
- unstructured_ingest/runner/github.py +0 -37
- unstructured_ingest/runner/gitlab.py +0 -37
- unstructured_ingest/runner/google_drive.py +0 -35
- unstructured_ingest/runner/hubspot.py +0 -35
- unstructured_ingest/runner/jira.py +0 -35
- unstructured_ingest/runner/kafka.py +0 -34
- unstructured_ingest/runner/local.py +0 -23
- unstructured_ingest/runner/mongodb.py +0 -34
- unstructured_ingest/runner/notion.py +0 -61
- unstructured_ingest/runner/onedrive.py +0 -35
- unstructured_ingest/runner/opensearch.py +0 -40
- unstructured_ingest/runner/outlook.py +0 -33
- unstructured_ingest/runner/reddit.py +0 -35
- unstructured_ingest/runner/salesforce.py +0 -33
- unstructured_ingest/runner/sharepoint.py +0 -35
- unstructured_ingest/runner/slack.py +0 -33
- unstructured_ingest/runner/utils.py +0 -47
- unstructured_ingest/runner/wikipedia.py +0 -35
- unstructured_ingest/runner/writers/__init__.py +0 -48
- unstructured_ingest/runner/writers/astradb.py +0 -22
- unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
- unstructured_ingest/runner/writers/base_writer.py +0 -26
- unstructured_ingest/runner/writers/chroma.py +0 -22
- unstructured_ingest/runner/writers/clarifai.py +0 -19
- unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
- unstructured_ingest/runner/writers/delta_table.py +0 -24
- unstructured_ingest/runner/writers/elasticsearch.py +0 -24
- unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
- unstructured_ingest/runner/writers/fsspec/box.py +0 -21
- unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
- unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
- unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
- unstructured_ingest/runner/writers/kafka.py +0 -21
- unstructured_ingest/runner/writers/mongodb.py +0 -21
- unstructured_ingest/runner/writers/opensearch.py +0 -26
- unstructured_ingest/runner/writers/pinecone.py +0 -21
- unstructured_ingest/runner/writers/qdrant.py +0 -19
- unstructured_ingest/runner/writers/sql.py +0 -22
- unstructured_ingest/runner/writers/vectara.py +0 -22
- unstructured_ingest/runner/writers/weaviate.py +0 -21
- unstructured_ingest/utils/google_filetype.py +0 -9
- unstructured_ingest/v2/__init__.py +0 -1
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +0 -4
- unstructured_ingest/v2/cli/base/cmd.py +0 -269
- unstructured_ingest/v2/cli/base/dest.py +0 -85
- unstructured_ingest/v2/cli/base/src.py +0 -85
- unstructured_ingest/v2/cli/cli.py +0 -24
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/logger.py +0 -126
- unstructured_ingest/v2/main.py +0 -11
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +0 -211
- unstructured_ingest/v2/pipeline/pipeline.py +0 -408
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
- {test/unit/v2 → examples}/__init__.py +0 -0
- /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
- /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/data_generator.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
- /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
- /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
- /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
- /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
- /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
- /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
- /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
- /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
- /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
- /unstructured_ingest/{v2 → utils}/constants.py +0 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
unstructured_ingest/cli/utils.py
DELETED
|
@@ -1,205 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import fields, is_dataclass
|
|
3
|
-
from gettext import gettext as _
|
|
4
|
-
|
|
5
|
-
import click
|
|
6
|
-
|
|
7
|
-
from unstructured_ingest.cli.interfaces import (
|
|
8
|
-
CliChunkingConfig,
|
|
9
|
-
CliConfig,
|
|
10
|
-
CliEmbeddingConfig,
|
|
11
|
-
CliPartitionConfig,
|
|
12
|
-
CliPermissionsConfig,
|
|
13
|
-
CliProcessorConfig,
|
|
14
|
-
CliReadConfig,
|
|
15
|
-
CliRetryStrategyConfig,
|
|
16
|
-
)
|
|
17
|
-
from unstructured_ingest.interfaces import BaseConfig
|
|
18
|
-
from unstructured_ingest.logger import logger
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def conform_click_options(options: dict):
    """Convert every tuple value in ``options`` to a list, in place.

    Click represents options declared with ``multiple=True`` as tuples; the
    rest of the CLI expects lists. The dictionary is mutated and nothing is
    returned.
    """
    # Collect the affected keys first, then rewrite their values.
    tuple_keys = [name for name, value in options.items() if isinstance(value, tuple)]
    for name in tuple_keys:
        options[name] = list(options[name])
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def extract_config(flat_data: dict, config: t.Type[BaseConfig]) -> BaseConfig:
    """
    Build an instance of ``config`` from a flat dictionary of options.

    To be able to extract a nested dataclass from a flat dictionary (as in one coming
    from a click-based options input), the config class is dynamically looked through for
    nested dataclass fields and new nested dictionaries are created to conform to the
    shape the overall class expects when parsing from a dict. During the process, this will
    create copies of the original dictionary to avoid pruning fields but this isn't a
    problem since the `from_dict()` method ignores unneeded values.

    Not handling more complex edge cases for now such as nested types
    i.e Union[List[List[...]]]

    Args:
        flat_data: flat mapping of option name to value.
        config: the config class to instantiate.

    Returns:
        The result of ``config.from_dict(...)`` applied to the reshaped dictionary.
    """

    def is_subclass(instance, class_type) -> bool:
        # Catch edge cases (i.e. Dict[str, ...]) where underlying type is not a concrete
        # Class, causing 'issubclass() arg 1 must be a class' errors, return False
        try:
            return issubclass(instance, class_type)
        except Exception:
            return False

    def conform_dict(inner_d: dict, inner_config: t.Type[BaseConfig]):
        dd = inner_d.copy()
        for field in fields(inner_config):
            f_type = field.type
            # Handle the case where the type of a value is a Union (possibly optional)
            if t.get_origin(f_type) is t.Union:
                union_values = t.get_args(f_type)
                # handle List types
                union_values = [
                    t.get_args(u)[0] if t.get_origin(u) is list else u for u in union_values
                ]
                # Ignore injected NoneType when optional
                concrete_union_values = [
                    v for v in union_values if not is_subclass(v, type(None))
                ]
                dataclass_union_values = [v for v in concrete_union_values if is_dataclass(v)]
                non_dataclass_union_values = [
                    v for v in concrete_union_values if not is_dataclass(v)
                ]
                if not dataclass_union_values:
                    continue
                # Check if the key for this field already exists in the dictionary,
                # if so it might map to one of these non dataclass fields and this
                # can't be enforced
                if non_dataclass_union_values and field.name in dd:
                    continue
                if len(dataclass_union_values) > 1:
                    # The candidates are classes, not strings: join their names so that
                    # the warning itself does not raise a TypeError.
                    candidate_names = ", ".join(
                        getattr(v, "__name__", repr(v)) for v in dataclass_union_values
                    )
                    logger.warning(
                        "more than one dataclass type possible for field {}, "
                        "not extracting: {}".format(field.name, candidate_names)
                    )
                    continue
                f_type = dataclass_union_values[0]
            # Reduce a parametrized generic to its origin before the subclass check.
            origin = t.get_origin(f_type)
            if origin:
                f_type = origin
            if is_subclass(f_type, BaseConfig):
                # The nested config is populated from the same flat set of keys.
                dd[field.name] = conform_dict(inner_d=dd, inner_config=f_type)
        return dd

    adjusted_dict = conform_dict(inner_d=flat_data, inner_config=config)
    return config.from_dict(adjusted_dict, apply_name_overload=False)
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
def extract_configs(
    data: dict,
    extras: t.Optional[t.Dict[str, t.Type[BaseConfig]]] = None,
    validate: t.Optional[t.List[t.Type[BaseConfig]]] = None,
    add_defaults: bool = True,
) -> t.Dict[str, BaseConfig]:
    """
    Extract the configs shared by every CLI command, plus any command-specific ones.

    Args:
        data: flat dictionary of the Click options passed in during invocation.
        extras: additional configs to extract, keyed by the name to store them under.
        validate: config classes that must be extractable from ``data``; their
            instances are discarded.
        add_defaults: when True, the common CLI configs are included in the result.

    Returns:
        Mapping of config name to the extracted config instance.

    Raises:
        Exception: when a config in ``validate`` cannot be extracted. Failures on
            ``extras`` are logged and re-raised unchanged.
    """
    res = {}
    if add_defaults:
        common_configs = {
            "read_config": CliReadConfig,
            "partition_config": CliPartitionConfig,
            "embedding_config": CliEmbeddingConfig,
            "chunking_config": CliChunkingConfig,
            "processor_config": CliProcessorConfig,
            "permissions_config": CliPermissionsConfig,
            "retry_strategy_config": CliRetryStrategyConfig,
        }
        for name, config_type in common_configs.items():
            res[name] = extract_config(flat_data=data, config=config_type)

    for k, conf in (extras or {}).items():
        try:
            res[k] = extract_config(flat_data=data, config=conf)
        except Exception as e:
            logger.error(f"failed to extract config from {conf.__name__}")
            raise e

    # Validation only checks that extraction succeeds; nothing is stored.
    for v in validate or []:
        try:
            extract_config(flat_data=data, config=v)
        except Exception as e:
            raise Exception(f"failed to validate config {v.__name__}") from e

    return res
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
def add_options(
    cmd: click.Command, extras: t.List[t.Type[CliConfig]], is_src: bool = True
) -> click.Command:
    """
    Attach the CLI options of every relevant config class to ``cmd``.

    Args:
        cmd: the click command to add options to.
        extras: config classes specific to this command; their options are added first.
        is_src: when True, the common configs (partition, read, embedding, chunking,
            processor, permissions, retry strategy) are added after ``extras``.

    Returns:
        The same ``cmd``, after each config's ``add_cli_options`` has been applied.

    Raises:
        ValueError: if a config fails to add its options; the message names the config.
    """
    common_configs: t.List[t.Type[CliConfig]] = (
        [
            CliPartitionConfig,
            CliReadConfig,
            CliEmbeddingConfig,
            CliChunkingConfig,
            CliProcessorConfig,
            CliPermissionsConfig,
            CliRetryStrategyConfig,
        ]
        if is_src
        else []
    )
    # make sure what's unique to this cmd appears first. A new list is built rather
    # than extending `extras` in place, so the caller's list is left untouched and can
    # be reused without accumulating duplicate configs.
    all_configs = [*extras, *common_configs]
    for config in all_configs:
        try:
            config.add_cli_options(cmd=cmd)
        except ValueError as e:
            raise ValueError(f"failed to set configs from {config.__name__}: {e}") from e
    return cmd
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
class Group(click.Group):
    """click.Group variant that tolerates ``--help`` on subcommands and labels its
    subcommand listing 'Destinations'."""

    def parse_args(self, ctx, args):
        """
        Parse arguments, letting subcommands show ``--help`` even when this parent
        command is missing some of its required parameters.
        """
        try:
            return super().parse_args(ctx, args)
        except click.MissingParameter:
            help_requested = "--help" in args
            if not help_requested:
                raise

        # Help was requested: drop the required flag from every parameter so the
        # second parse can get far enough to display it.
        for param in self.params:
            param.required = False
        return super().parse_args(ctx, args)

    def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None:
        """
        Mirror of click.Group.format_commands() that titles the section
        'Destinations' instead of 'Commands'.
        """
        visible = []
        for name in self.list_commands(ctx):
            command = self.get_command(ctx, name)
            # Skip names that resolve to no command, and hidden commands.
            if command is None:
                continue
            if command.hidden:
                continue
            visible.append((name, command))

        if not visible:
            return

        # allow for 3 times the default spacing
        longest_name = max(len(entry[0]) for entry in visible)
        if formatter.width:
            limit = formatter.width - 6 - longest_name
        else:
            limit = -6 - longest_name

        rows = [(name, command.get_short_help_str(limit)) for name, command in visible]
        if rows:
            with formatter.section(_("Destinations")):
                formatter.write_dl(rows)
|
|
@@ -1,309 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass, field
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
7
|
-
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
8
|
-
from unstructured_ingest.interfaces import (
|
|
9
|
-
AccessConfig,
|
|
10
|
-
BaseConnectorConfig,
|
|
11
|
-
BaseSingleIngestDoc,
|
|
12
|
-
BaseSourceConnector,
|
|
13
|
-
IngestDocCleanupMixin,
|
|
14
|
-
SourceConnectorCleanupMixin,
|
|
15
|
-
SourceMetadata,
|
|
16
|
-
)
|
|
17
|
-
from unstructured_ingest.logger import logger
|
|
18
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
19
|
-
|
|
20
|
-
if t.TYPE_CHECKING:
|
|
21
|
-
from pyairtable import Api
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
@dataclass
class AirtableAccessConfig(AccessConfig):
    """Credentials for Airtable: a personal access token, flagged as sensitive."""

    personal_access_token: str = enhanced_field(sensitive=True)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@dataclass
class SimpleAirtableConfig(BaseConnectorConfig):
    """Connector config where:
    access_config.personal_access_token is the token used to authenticate into Airtable.
    list_of_paths is an optional whitespace-separated string of paths to ingest;
    when it is empty or None the connector ingests every base it can list.

    Check https://support.airtable.com/docs/airtable-api-key-deprecation-notice
    for more info on authentication.
    """

    access_config: AirtableAccessConfig
    list_of_paths: t.Optional[str] = None
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
@dataclass
class AirtableTableMeta:
    """Identifies one Airtable table to ingest.

    Holds the table id, the id of the base the table is stored in, and an
    optional view id in case only particular rows and fields are to be ingested.
    """

    base_id: str
    table_id: str
    view_id: t.Optional[str] = None
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
@dataclass
class AirtableIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating fetching a doc and writing processed results (but not
    doing the processing).

    Current implementation creates an Airtable connection object
    to fetch each document, rather than creating one for each thread.
    """

    connector_config: SimpleAirtableConfig
    table_meta: AirtableTableMeta
    registry_name: str = "airtable"

    @property
    def filename(self):
        """Absolute path of the downloaded CSV: <download_dir>/<base_id>/<table_id>.csv"""
        return (
            Path(self.read_config.download_dir)
            / self.table_meta.base_id
            / f"{self.table_meta.table_id}.csv"
        ).resolve()

    @property
    def _output_filename(self):
        """Create output file path based on output directory, base id, and table id"""
        output_file = f"{self.table_meta.table_id}.json"
        return Path(self.processor_config.output_dir) / self.table_meta.base_id / output_file

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """The base, table and view ids that locate this document in Airtable."""
        return {
            "base_id": self.table_meta.base_id,
            "table_id": self.table_meta.table_id,
            "view_id": self.table_meta.view_id,
        }

    @property
    def version(self) -> t.Optional[str]:
        """No version is tracked for Airtable tables; always None."""
        return None

    @requires_dependencies(["pyairtable"], extras="airtable")
    def _query_table(self):
        """Fetch all rows of the table (restricted to the view, if one is set).

        Returns:
            A ``(rows, table_url)`` tuple.
        """
        from pyairtable import Api

        api = Api(self.connector_config.access_config.personal_access_token)
        table = api.table(self.table_meta.base_id, self.table_meta.table_id)
        table_url = table.url
        rows = table.all(
            view=self.table_meta.view_id,
        )
        return rows, table_url

    @SourceConnectionNetworkError.wrap
    def _get_table_rows(self):
        """Query the table and log when it comes back empty.

        Returns:
            A ``(rows, table_url)`` tuple.
        """
        rows, table_url = self._query_table()

        if len(rows) == 0:
            logger.info("Empty document, retrieved table but it has no rows.")
        return rows, table_url

    def update_source_metadata(self, **kwargs):
        """Gets file metadata from the current table.

        Accepts an optional ``rows_tuple=(rows, table_url)`` keyword so that a caller
        which has already fetched the rows does not trigger a second query.
        """
        # Only query Airtable when the rows were not supplied. The previous
        # `kwargs.get("rows_tuple", self._get_table_rows())` evaluated its default
        # eagerly, so the table was fetched again even when rows were passed in.
        if "rows_tuple" in kwargs:
            rows, table_url = kwargs["rows_tuple"]
        else:
            rows, table_url = self._get_table_rows()

        if rows is None or len(rows) < 1:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return

        # Earliest and latest row creation times stand in for created/modified dates.
        dates = [r.get("createdTime", "") for r in rows]
        dates.sort()

        timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
        date_created = datetime.strptime(dates[0], timestamp_format).isoformat()
        date_modified = datetime.strptime(dates[-1], timestamp_format).isoformat()

        self.source_metadata = SourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            source_url=table_url,
            exists=True,
        )

    @SourceConnectionError.wrap
    @requires_dependencies(["pandas"])
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the table and write it as a CSV file at ``self.filename``.

        Raises:
            ValueError: if no rows object could be retrieved for the table.
        """
        import pandas as pd

        rows, table_url = self._get_table_rows()
        # Hand the fetched rows over so the metadata update does not re-query.
        self.update_source_metadata(rows_tuple=(rows, table_url))
        if rows is None:
            raise ValueError(
                "Failed to retrieve rows from table "
                f"{self.table_meta.base_id}/{self.table_meta.table_id}. Check logs",
            )
        # NOTE: Might be a good idea to add pagination for large tables
        df = pd.DataFrame.from_dict(
            [row["fields"] for row in rows],
        ).sort_index(axis=1)

        self.document = df.to_csv()
        self.filename.parent.mkdir(parents=True, exist_ok=True)

        with open(self.filename, "w", encoding="utf8") as f:
            f.write(self.document)
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
# Expected id prefix for each path component, in order: base, table, view.
airtable_id_prefixes = ["app", "tbl", "viw"]


def raise_airtable_path_error(piece):
    """Raise a ValueError describing why ``piece`` is not valid at its position.

    If the component starts with some known Airtable prefix, the path components are
    in the wrong order; otherwise the component is not an Airtable id at all.

    Raises:
        ValueError: always.
    """
    if piece.startswith(tuple(airtable_id_prefixes)):
        # Adjacent literals are used instead of backslash continuations so that
        # source indentation does not end up inside the message.
        raise ValueError(
            "Path components are not correctly ordered. "
            "Valid path structures: "
            "- base_id/table_id/view_id , "
            "- base_id/table_id, "
            "- base_id . "
            "It is also possible to leave --airtable-list-of-paths "
            "argument empty (this will ingest everything)."
        )
    raise ValueError(
        "Path components are not valid Airtable ids. "
        "base_id should look like: appAbcDeF1ghijKlm, "
        "table_id should look like: tblAbcDeF1ghijKlm, "
        "view_id should look like: viwAbcDeF1ghijKlm"
    )


def check_path_validity(path):
    """Validate an Airtable path of the form base_id[/table_id[/view_id]].

    Raises:
        AssertionError: if the path has more than three components.
        ValueError: if a component does not carry the prefix expected at its position.
    """
    pieces = path.split("/")
    # Raised explicitly rather than via `assert`, so the check still runs under
    # `python -O`; the exception type is kept for callers that catch it.
    if not 1 <= len(pieces) <= 3:
        raise AssertionError(
            "Path should be composed of between 1-3 "
            "components (base_id, table_id, view_id)."
        )

    for expected_prefix, piece in zip(airtable_id_prefixes, pieces):
        if not piece.startswith(expected_prefix):
            raise_airtable_path_error(piece)
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
@dataclass
class AirtableSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Fetches tables or views from an Airtable org."""

    connector_config: SimpleAirtableConfig
    _api: t.Optional["Api"] = field(init=False, default=None)

    @property
    def api(self):
        """The pyairtable client, created on first access."""
        if self._api is None:
            # `Api` is only imported at module level under TYPE_CHECKING, so it must be
            # imported here; otherwise accessing this property before initialize()
            # (e.g. from check_connection) raises NameError.
            from pyairtable import Api

            self._api = Api(self.connector_config.access_config.personal_access_token)
        return self._api

    @api.setter
    def api(self, api: "Api"):
        self._api = api

    def check_connection(self):
        """Issue a HEAD request against the bases metadata endpoint.

        Raises:
            SourceConnectionError: if the request fails with an HTTP error.
        """
        import requests

        try:
            self.api.request(method="HEAD", url=self.api.build_url("meta", "bases"))
        except requests.HTTPError as http_error:
            logger.error(f"failed to validate connection: {http_error}", exc_info=True)
            raise SourceConnectionError(
                f"failed to validate connection: {http_error}"
            ) from http_error

    @requires_dependencies(["pyairtable"], extras="airtable")
    def initialize(self):
        """Reset the list of bases to scan, split the configured paths, and create the
        API client."""
        from pyairtable import Api

        self.base_ids_to_fetch_tables_from = []
        if self.connector_config.list_of_paths:
            self.list_of_paths = self.connector_config.list_of_paths.split()

        self.api = Api(self.connector_config.access_config.personal_access_token)

    @requires_dependencies(["pyairtable"], extras="airtable")
    def use_all_bases(self):
        """Mark every base returned by the API as one to fetch tables from."""
        from pyairtable.metadata import get_api_bases

        self.base_ids_to_fetch_tables_from = [
            base["id"] for base in get_api_bases(self.api)["bases"]
        ]

    @requires_dependencies(["pyairtable"], extras="airtable")
    def fetch_table_ids(self):
        """List the tables of every base in ``base_ids_to_fetch_tables_from``.

        Returns:
            A list of ``(base_id, table_id, None)`` tuples; the view id is always None.
        """
        from pyairtable.metadata import get_base_schema

        bases = [
            (base_id, self.api.base(base_id)) for base_id in self.base_ids_to_fetch_tables_from
        ]

        metadata_for_each_base = [
            (base_id, get_base_schema(base)["tables"]) for base_id, base in bases
        ]

        baseid_tableid_viewid_tuples = [
            (base_id, table["id"], None)
            for base_id, base_metadata in metadata_for_each_base
            for table in base_metadata
        ]

        return baseid_tableid_viewid_tuples

    def get_ingest_docs(self):
        """Fetches documents in an Airtable org."""

        # When no list of paths provided, the connector ingests everything.
        if not self.connector_config.list_of_paths:
            self.use_all_bases()
            baseid_tableid_viewid_tuples = self.fetch_table_ids()

        # When there is a list of paths, the connector checks the validity
        # of the paths, and fetches table_ids to be ingested, based on the paths.
        else:
            self.paths = self.connector_config.list_of_paths.split()
            self.paths = [path.strip("/") for path in self.paths]

            # Validation is a side effect only, so use a plain loop.
            for path in self.paths:
                check_path_validity(path)

            self.base_ids_to_fetch_tables_from = []
            baseid_tableid_viewid_tuples = []

            for path in self.paths:
                components = path.split("/")
                if len(components) == 1:  # only a base_id is provided
                    self.base_ids_to_fetch_tables_from.append(components[0])
                elif len(components) == 2:  # a base_id and a table_id are provided
                    baseid_tableid_viewid_tuples.append((components[0], components[1], None))
                elif len(components) == 3:  # a base_id, table_id, and a view_id are provided
                    baseid_tableid_viewid_tuples.append(
                        (components[0], components[1], components[2]),
                    )

            # Expand the bases given without a table into all of their tables.
            baseid_tableid_viewid_tuples += self.fetch_table_ids()
        return [
            AirtableIngestDoc(
                processor_config=self.processor_config,
                connector_config=self.connector_config,
                read_config=self.read_config,
                table_meta=AirtableTableMeta(base_id, table_id, view_id),
            )
            for base_id, table_id, view_id in baseid_tableid_viewid_tuples
        ]
|