unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- examples/airtable.py +44 -0
- examples/azure_cognitive_search.py +55 -0
- examples/chroma.py +54 -0
- examples/couchbase.py +55 -0
- examples/databricks_volumes_dest.py +55 -0
- examples/databricks_volumes_source.py +53 -0
- examples/delta_table.py +45 -0
- examples/discord_example.py +36 -0
- examples/elasticsearch.py +49 -0
- examples/google_drive.py +45 -0
- examples/kdbai.py +54 -0
- examples/local.py +36 -0
- examples/milvus.py +44 -0
- examples/mongodb.py +53 -0
- examples/opensearch.py +50 -0
- examples/pinecone.py +57 -0
- examples/s3.py +38 -0
- examples/salesforce.py +44 -0
- examples/sharepoint.py +47 -0
- examples/singlestore.py +49 -0
- examples/sql.py +90 -0
- examples/vectara.py +54 -0
- examples/weaviate.py +44 -0
- test/integration/chunkers/test_chunkers.py +1 -1
- test/integration/connectors/conftest.py +1 -1
- test/integration/connectors/databricks/test_volumes_native.py +3 -3
- test/integration/connectors/discord/test_discord.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +2 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
- test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
- test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
- test/integration/connectors/sql/test_postgres.py +2 -2
- test/integration/connectors/sql/test_singlestore.py +2 -2
- test/integration/connectors/sql/test_snowflake.py +2 -2
- test/integration/connectors/sql/test_sqlite.py +2 -2
- test/integration/connectors/sql/test_vastdb.py +1 -1
- test/integration/connectors/test_astradb.py +2 -2
- test/integration/connectors/test_azure_ai_search.py +2 -2
- test/integration/connectors/test_chroma.py +2 -2
- test/integration/connectors/test_confluence.py +1 -1
- test/integration/connectors/test_delta_table.py +2 -2
- test/integration/connectors/test_dropbox.py +2 -2
- test/integration/connectors/test_github.py +1 -1
- test/integration/connectors/test_google_drive.py +2 -2
- test/integration/connectors/test_jira.py +1 -1
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +2 -2
- test/integration/connectors/test_mongodb.py +2 -2
- test/integration/connectors/test_neo4j.py +7 -7
- test/integration/connectors/test_notion.py +2 -2
- test/integration/connectors/test_onedrive.py +2 -2
- test/integration/connectors/test_pinecone.py +3 -3
- test/integration/connectors/test_qdrant.py +6 -6
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +3 -3
- test/integration/connectors/test_sharepoint.py +1 -1
- test/integration/connectors/test_vectara.py +4 -4
- test/integration/connectors/test_zendesk.py +2 -2
- test/integration/connectors/utils/validation/destination.py +2 -2
- test/integration/connectors/utils/validation/source.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +1 -1
- test/integration/connectors/weaviate/test_local.py +2 -2
- test/integration/embedders/test_azure_openai.py +1 -1
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -1
- test/integration/embedders/test_mixedbread.py +1 -1
- test/integration/embedders/test_octoai.py +2 -2
- test/integration/embedders/test_openai.py +2 -2
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +1 -1
- test/integration/embedders/test_voyageai.py +1 -1
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
- test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
- test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
- test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
- test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
- test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
- test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
- test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
- test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
- test/unit/test_html.py +1 -1
- test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
- test/unit/test_utils.py +106 -97
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/__init__.py +0 -14
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +259 -9
- unstructured_ingest/cli/base/dest.py +58 -61
- unstructured_ingest/cli/base/src.py +54 -36
- unstructured_ingest/cli/cli.py +4 -17
- unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
- unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
- unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
- unstructured_ingest/embed/bedrock.py +3 -3
- unstructured_ingest/embed/octoai.py +3 -3
- unstructured_ingest/embed/openai.py +3 -3
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +4 -4
- unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
- unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
- unstructured_ingest/{v2/otel.py → otel.py} +1 -1
- unstructured_ingest/pipeline/__init__.py +0 -22
- unstructured_ingest/pipeline/interfaces.py +179 -238
- unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
- unstructured_ingest/pipeline/pipeline.py +388 -97
- unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
- unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
- unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
- unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
- unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
- unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
- unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
- unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
- unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
- unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
- unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
- unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
- unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
- unstructured_ingest/utils/compression.py +1 -48
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/utils/html.py +3 -3
- unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
- unstructured_ingest/utils/string_and_date_utils.py +1 -1
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +21 -21
- unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
- test/unit/v2/test_utils.py +0 -82
- unstructured_ingest/cli/cmd_factory.py +0 -12
- unstructured_ingest/cli/cmds/__init__.py +0 -145
- unstructured_ingest/cli/cmds/airtable.py +0 -69
- unstructured_ingest/cli/cmds/astradb.py +0 -99
- unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
- unstructured_ingest/cli/cmds/biomed.py +0 -52
- unstructured_ingest/cli/cmds/chroma.py +0 -104
- unstructured_ingest/cli/cmds/clarifai.py +0 -71
- unstructured_ingest/cli/cmds/confluence.py +0 -69
- unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
- unstructured_ingest/cli/cmds/delta_table.py +0 -94
- unstructured_ingest/cli/cmds/discord.py +0 -47
- unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
- unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
- unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
- unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
- unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
- unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
- unstructured_ingest/cli/cmds/github.py +0 -54
- unstructured_ingest/cli/cmds/gitlab.py +0 -54
- unstructured_ingest/cli/cmds/google_drive.py +0 -49
- unstructured_ingest/cli/cmds/hubspot.py +0 -70
- unstructured_ingest/cli/cmds/jira.py +0 -71
- unstructured_ingest/cli/cmds/kafka.py +0 -102
- unstructured_ingest/cli/cmds/local.py +0 -43
- unstructured_ingest/cli/cmds/mongodb.py +0 -72
- unstructured_ingest/cli/cmds/notion.py +0 -48
- unstructured_ingest/cli/cmds/onedrive.py +0 -66
- unstructured_ingest/cli/cmds/opensearch.py +0 -117
- unstructured_ingest/cli/cmds/outlook.py +0 -67
- unstructured_ingest/cli/cmds/pinecone.py +0 -71
- unstructured_ingest/cli/cmds/qdrant.py +0 -124
- unstructured_ingest/cli/cmds/reddit.py +0 -67
- unstructured_ingest/cli/cmds/salesforce.py +0 -58
- unstructured_ingest/cli/cmds/sharepoint.py +0 -66
- unstructured_ingest/cli/cmds/slack.py +0 -56
- unstructured_ingest/cli/cmds/sql.py +0 -66
- unstructured_ingest/cli/cmds/vectara.py +0 -66
- unstructured_ingest/cli/cmds/weaviate.py +0 -98
- unstructured_ingest/cli/cmds/wikipedia.py +0 -40
- unstructured_ingest/cli/common.py +0 -7
- unstructured_ingest/cli/interfaces.py +0 -663
- unstructured_ingest/cli/utils.py +0 -205
- unstructured_ingest/connector/airtable.py +0 -309
- unstructured_ingest/connector/astradb.py +0 -267
- unstructured_ingest/connector/azure_ai_search.py +0 -144
- unstructured_ingest/connector/biomed.py +0 -320
- unstructured_ingest/connector/chroma.py +0 -158
- unstructured_ingest/connector/clarifai.py +0 -122
- unstructured_ingest/connector/confluence.py +0 -285
- unstructured_ingest/connector/databricks_volumes.py +0 -137
- unstructured_ingest/connector/delta_table.py +0 -203
- unstructured_ingest/connector/discord.py +0 -180
- unstructured_ingest/connector/elasticsearch.py +0 -396
- unstructured_ingest/connector/fsspec/azure.py +0 -78
- unstructured_ingest/connector/fsspec/box.py +0 -109
- unstructured_ingest/connector/fsspec/dropbox.py +0 -160
- unstructured_ingest/connector/fsspec/fsspec.py +0 -359
- unstructured_ingest/connector/fsspec/gcs.py +0 -82
- unstructured_ingest/connector/fsspec/s3.py +0 -62
- unstructured_ingest/connector/fsspec/sftp.py +0 -81
- unstructured_ingest/connector/git.py +0 -124
- unstructured_ingest/connector/github.py +0 -174
- unstructured_ingest/connector/gitlab.py +0 -142
- unstructured_ingest/connector/google_drive.py +0 -348
- unstructured_ingest/connector/hubspot.py +0 -278
- unstructured_ingest/connector/jira.py +0 -469
- unstructured_ingest/connector/kafka.py +0 -293
- unstructured_ingest/connector/local.py +0 -139
- unstructured_ingest/connector/mongodb.py +0 -284
- unstructured_ingest/connector/notion/client.py +0 -248
- unstructured_ingest/connector/notion/connector.py +0 -469
- unstructured_ingest/connector/notion/helpers.py +0 -584
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
- unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
- unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
- unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
- unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
- unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
- unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
- unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
- unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
- unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
- unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
- unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
- unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
- unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
- unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
- unstructured_ingest/connector/notion/types/date.py +0 -26
- unstructured_ingest/connector/notion/types/file.py +0 -51
- unstructured_ingest/connector/notion/types/user.py +0 -76
- unstructured_ingest/connector/onedrive.py +0 -232
- unstructured_ingest/connector/opensearch.py +0 -218
- unstructured_ingest/connector/outlook.py +0 -285
- unstructured_ingest/connector/pinecone.py +0 -150
- unstructured_ingest/connector/qdrant.py +0 -144
- unstructured_ingest/connector/reddit.py +0 -166
- unstructured_ingest/connector/registry.py +0 -109
- unstructured_ingest/connector/salesforce.py +0 -301
- unstructured_ingest/connector/sharepoint.py +0 -573
- unstructured_ingest/connector/slack.py +0 -224
- unstructured_ingest/connector/sql.py +0 -199
- unstructured_ingest/connector/vectara.py +0 -253
- unstructured_ingest/connector/weaviate.py +0 -190
- unstructured_ingest/connector/wikipedia.py +0 -208
- unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
- unstructured_ingest/enhanced_dataclass/core.py +0 -99
- unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
- unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
- unstructured_ingest/interfaces.py +0 -852
- unstructured_ingest/pipeline/copy.py +0 -19
- unstructured_ingest/pipeline/doc_factory.py +0 -12
- unstructured_ingest/pipeline/partition.py +0 -60
- unstructured_ingest/pipeline/permissions.py +0 -12
- unstructured_ingest/pipeline/reformat/chunking.py +0 -134
- unstructured_ingest/pipeline/reformat/embedding.py +0 -64
- unstructured_ingest/pipeline/source.py +0 -77
- unstructured_ingest/pipeline/utils.py +0 -6
- unstructured_ingest/pipeline/write.py +0 -18
- unstructured_ingest/processor.py +0 -93
- unstructured_ingest/runner/__init__.py +0 -104
- unstructured_ingest/runner/airtable.py +0 -35
- unstructured_ingest/runner/astradb.py +0 -34
- unstructured_ingest/runner/base_runner.py +0 -89
- unstructured_ingest/runner/biomed.py +0 -45
- unstructured_ingest/runner/confluence.py +0 -35
- unstructured_ingest/runner/delta_table.py +0 -34
- unstructured_ingest/runner/discord.py +0 -35
- unstructured_ingest/runner/elasticsearch.py +0 -40
- unstructured_ingest/runner/fsspec/azure.py +0 -30
- unstructured_ingest/runner/fsspec/box.py +0 -28
- unstructured_ingest/runner/fsspec/dropbox.py +0 -30
- unstructured_ingest/runner/fsspec/fsspec.py +0 -40
- unstructured_ingest/runner/fsspec/gcs.py +0 -28
- unstructured_ingest/runner/fsspec/s3.py +0 -28
- unstructured_ingest/runner/fsspec/sftp.py +0 -28
- unstructured_ingest/runner/github.py +0 -37
- unstructured_ingest/runner/gitlab.py +0 -37
- unstructured_ingest/runner/google_drive.py +0 -35
- unstructured_ingest/runner/hubspot.py +0 -35
- unstructured_ingest/runner/jira.py +0 -35
- unstructured_ingest/runner/kafka.py +0 -34
- unstructured_ingest/runner/local.py +0 -23
- unstructured_ingest/runner/mongodb.py +0 -34
- unstructured_ingest/runner/notion.py +0 -61
- unstructured_ingest/runner/onedrive.py +0 -35
- unstructured_ingest/runner/opensearch.py +0 -40
- unstructured_ingest/runner/outlook.py +0 -33
- unstructured_ingest/runner/reddit.py +0 -35
- unstructured_ingest/runner/salesforce.py +0 -33
- unstructured_ingest/runner/sharepoint.py +0 -35
- unstructured_ingest/runner/slack.py +0 -33
- unstructured_ingest/runner/utils.py +0 -47
- unstructured_ingest/runner/wikipedia.py +0 -35
- unstructured_ingest/runner/writers/__init__.py +0 -48
- unstructured_ingest/runner/writers/astradb.py +0 -22
- unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
- unstructured_ingest/runner/writers/base_writer.py +0 -26
- unstructured_ingest/runner/writers/chroma.py +0 -22
- unstructured_ingest/runner/writers/clarifai.py +0 -19
- unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
- unstructured_ingest/runner/writers/delta_table.py +0 -24
- unstructured_ingest/runner/writers/elasticsearch.py +0 -24
- unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
- unstructured_ingest/runner/writers/fsspec/box.py +0 -21
- unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
- unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
- unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
- unstructured_ingest/runner/writers/kafka.py +0 -21
- unstructured_ingest/runner/writers/mongodb.py +0 -21
- unstructured_ingest/runner/writers/opensearch.py +0 -26
- unstructured_ingest/runner/writers/pinecone.py +0 -21
- unstructured_ingest/runner/writers/qdrant.py +0 -19
- unstructured_ingest/runner/writers/sql.py +0 -22
- unstructured_ingest/runner/writers/vectara.py +0 -22
- unstructured_ingest/runner/writers/weaviate.py +0 -21
- unstructured_ingest/utils/google_filetype.py +0 -9
- unstructured_ingest/v2/__init__.py +0 -1
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +0 -4
- unstructured_ingest/v2/cli/base/cmd.py +0 -269
- unstructured_ingest/v2/cli/base/dest.py +0 -85
- unstructured_ingest/v2/cli/base/src.py +0 -85
- unstructured_ingest/v2/cli/cli.py +0 -24
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/logger.py +0 -126
- unstructured_ingest/v2/main.py +0 -11
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +0 -211
- unstructured_ingest/v2/pipeline/pipeline.py +0 -408
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
- {test/unit/v2 → examples}/__init__.py +0 -0
- /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
- /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/data_generator.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
- /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
- /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
- /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
- /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
- /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
- /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
- /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
- /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
- /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
- /unstructured_ingest/{v2 → utils}/constants.py +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
|
@@ -8,9 +8,7 @@ from unstructured_ingest.embed.interfaces import (
|
|
|
8
8
|
BaseEmbeddingEncoder,
|
|
9
9
|
EmbeddingConfig,
|
|
10
10
|
)
|
|
11
|
-
from unstructured_ingest.
|
|
12
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
-
from unstructured_ingest.v2.errors import (
|
|
11
|
+
from unstructured_ingest.errors_v2 import (
|
|
14
12
|
ProviderError,
|
|
15
13
|
QuotaError,
|
|
16
14
|
RateLimitError,
|
|
@@ -18,6 +16,8 @@ from unstructured_ingest.v2.errors import (
|
|
|
18
16
|
UserError,
|
|
19
17
|
is_internal_error,
|
|
20
18
|
)
|
|
19
|
+
from unstructured_ingest.logger import logger
|
|
20
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
21
21
|
|
|
22
22
|
if TYPE_CHECKING:
|
|
23
23
|
from openai import AsyncOpenAI, OpenAI
|
|
@@ -8,9 +8,7 @@ from unstructured_ingest.embed.interfaces import (
|
|
|
8
8
|
BaseEmbeddingEncoder,
|
|
9
9
|
EmbeddingConfig,
|
|
10
10
|
)
|
|
11
|
-
from unstructured_ingest.
|
|
12
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
-
from unstructured_ingest.v2.errors import (
|
|
11
|
+
from unstructured_ingest.errors_v2 import (
|
|
14
12
|
ProviderError,
|
|
15
13
|
QuotaError,
|
|
16
14
|
RateLimitError,
|
|
@@ -18,6 +16,8 @@ from unstructured_ingest.v2.errors import (
|
|
|
18
16
|
UserError,
|
|
19
17
|
is_internal_error,
|
|
20
18
|
)
|
|
19
|
+
from unstructured_ingest.logger import logger
|
|
20
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
21
21
|
|
|
22
22
|
if TYPE_CHECKING:
|
|
23
23
|
from openai import AsyncOpenAI, OpenAI
|
|
@@ -8,12 +8,12 @@ from unstructured_ingest.embed.interfaces import (
|
|
|
8
8
|
BaseEmbeddingEncoder,
|
|
9
9
|
EmbeddingConfig,
|
|
10
10
|
)
|
|
11
|
-
from unstructured_ingest.
|
|
12
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
-
from unstructured_ingest.v2.errors import (
|
|
11
|
+
from unstructured_ingest.errors_v2 import (
|
|
14
12
|
RateLimitError as CustomRateLimitError,
|
|
15
13
|
)
|
|
16
|
-
from unstructured_ingest.
|
|
14
|
+
from unstructured_ingest.errors_v2 import UserAuthError, UserError, is_internal_error
|
|
15
|
+
from unstructured_ingest.logger import logger
|
|
16
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
17
|
|
|
18
18
|
if TYPE_CHECKING:
|
|
19
19
|
from together import AsyncTogether, Together
|
|
@@ -13,8 +13,8 @@ from unstructured_ingest.embed.interfaces import (
|
|
|
13
13
|
BaseEmbeddingEncoder,
|
|
14
14
|
EmbeddingConfig,
|
|
15
15
|
)
|
|
16
|
+
from unstructured_ingest.errors_v2 import UserAuthError, is_internal_error
|
|
16
17
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
|
-
from unstructured_ingest.v2.errors import UserAuthError, is_internal_error
|
|
18
18
|
|
|
19
19
|
if TYPE_CHECKING:
|
|
20
20
|
from vertexai.language_models import TextEmbeddingModel
|
|
@@ -8,12 +8,12 @@ from unstructured_ingest.embed.interfaces import (
|
|
|
8
8
|
BaseEmbeddingEncoder,
|
|
9
9
|
EmbeddingConfig,
|
|
10
10
|
)
|
|
11
|
-
from unstructured_ingest.
|
|
12
|
-
from unstructured_ingest.
|
|
13
|
-
from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError, is_internal_error
|
|
14
|
-
from unstructured_ingest.v2.errors import (
|
|
11
|
+
from unstructured_ingest.errors_v2 import ProviderError, UserAuthError, UserError, is_internal_error
|
|
12
|
+
from unstructured_ingest.errors_v2 import (
|
|
15
13
|
RateLimitError as CustomRateLimitError,
|
|
16
14
|
)
|
|
15
|
+
from unstructured_ingest.logger import logger
|
|
16
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
17
|
|
|
18
18
|
if TYPE_CHECKING:
|
|
19
19
|
from voyageai import AsyncClient as AsyncVoyageAIClient
|
|
@@ -5,9 +5,9 @@ from typing import Any, Optional, TypedDict, TypeVar, Union
|
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel, Field
|
|
7
7
|
|
|
8
|
-
from unstructured_ingest.
|
|
9
|
-
from unstructured_ingest.
|
|
10
|
-
from unstructured_ingest.
|
|
8
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
9
|
+
from unstructured_ingest.interfaces.connector import BaseConnector
|
|
10
|
+
from unstructured_ingest.interfaces.process import BaseProcess
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class DownloaderConfig(BaseModel):
|
|
@@ -3,9 +3,9 @@ from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
|
|
|
3
3
|
|
|
4
4
|
from pydantic import BaseModel
|
|
5
5
|
|
|
6
|
-
from unstructured_ingest.
|
|
7
|
-
from unstructured_ingest.
|
|
8
|
-
from unstructured_ingest.
|
|
6
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
7
|
+
from unstructured_ingest.interfaces.connector import BaseConnector
|
|
8
|
+
from unstructured_ingest.interfaces.process import BaseProcess
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class IndexerConfig(BaseModel):
|
|
@@ -5,10 +5,10 @@ from typing import Any, TypeVar
|
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
9
|
+
from unstructured_ingest.interfaces import BaseProcess
|
|
8
10
|
from unstructured_ingest.utils import ndjson
|
|
9
11
|
from unstructured_ingest.utils.data_prep import get_data, write_data
|
|
10
|
-
from unstructured_ingest.v2.interfaces import BaseProcess
|
|
11
|
-
from unstructured_ingest.v2.types.file_data import FileData
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class UploadStagerConfig(BaseModel):
|
|
@@ -5,9 +5,9 @@ from typing import Any, TypeVar
|
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
9
|
+
from unstructured_ingest.interfaces import BaseConnector, BaseProcess
|
|
8
10
|
from unstructured_ingest.utils.data_prep import get_data
|
|
9
|
-
from unstructured_ingest.v2.interfaces import BaseConnector, BaseProcess
|
|
10
|
-
from unstructured_ingest.v2.types.file_data import FileData
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class UploaderConfig(BaseModel):
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
from .doc_factory import DocFactory
|
|
2
|
-
from .interfaces import PipelineContext, ReformatNode
|
|
3
|
-
from .partition import Partitioner
|
|
4
|
-
from .permissions import PermissionsDataCleaner
|
|
5
|
-
from .pipeline import Pipeline
|
|
6
|
-
from .reformat.chunking import Chunker
|
|
7
|
-
from .reformat.embedding import Embedder
|
|
8
|
-
from .source import Reader
|
|
9
|
-
from .write import Writer
|
|
10
|
-
|
|
11
|
-
__all__ = [
|
|
12
|
-
"DocFactory",
|
|
13
|
-
"Partitioner",
|
|
14
|
-
"Reader",
|
|
15
|
-
"Embedder",
|
|
16
|
-
"PipelineContext",
|
|
17
|
-
"Pipeline",
|
|
18
|
-
"Writer",
|
|
19
|
-
"Chunker",
|
|
20
|
-
"ReformatNode",
|
|
21
|
-
"PermissionsDataCleaner",
|
|
22
|
-
]
|
|
@@ -1,270 +1,211 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
3
4
|
import logging
|
|
4
5
|
import multiprocessing as mp
|
|
5
|
-
import
|
|
6
|
+
import shutil
|
|
6
7
|
from abc import ABC, abstractmethod
|
|
7
|
-
from
|
|
8
|
-
from
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
9
|
+
from dataclasses import dataclass
|
|
9
10
|
from pathlib import Path
|
|
11
|
+
from typing import Any, Awaitable, Callable, Optional, TypeVar
|
|
10
12
|
|
|
11
|
-
from
|
|
13
|
+
from tqdm import tqdm
|
|
14
|
+
from tqdm.asyncio import tqdm as tqdm_asyncio
|
|
12
15
|
|
|
13
|
-
from unstructured_ingest.
|
|
14
|
-
from unstructured_ingest.
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
PartitionConfig,
|
|
18
|
-
ProcessorConfig,
|
|
19
|
-
ReadConfig,
|
|
20
|
-
RetryStrategyConfig,
|
|
21
|
-
)
|
|
22
|
-
from unstructured_ingest.logger import ingest_log_streaming_init, logger
|
|
16
|
+
from unstructured_ingest.interfaces import BaseProcess, ProcessorConfig, Uploader
|
|
17
|
+
from unstructured_ingest.logger import logger, make_default_logger
|
|
18
|
+
from unstructured_ingest.otel import OtelHandler
|
|
19
|
+
from unstructured_ingest.pipeline.otel import instrument
|
|
23
20
|
|
|
24
|
-
|
|
25
|
-
|
|
21
|
+
BaseProcessT = TypeVar("BaseProcessT", bound=BaseProcess)
|
|
22
|
+
iterable_input = list[dict[str, Any]]
|
|
26
23
|
|
|
27
24
|
|
|
28
25
|
@dataclass
|
|
29
|
-
class
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def __post_init__(self):
|
|
35
|
-
self._ingest_docs_map: t.Optional[DictProxy] = None
|
|
26
|
+
class PipelineStep(ABC):
|
|
27
|
+
process: BaseProcessT
|
|
28
|
+
context: ProcessorConfig
|
|
29
|
+
identifier: str
|
|
36
30
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
if self._ingest_docs_map is None:
|
|
40
|
-
raise ValueError("ingest_docs_map never initialized")
|
|
41
|
-
return self._ingest_docs_map
|
|
31
|
+
def __str__(self):
|
|
32
|
+
return self.identifier
|
|
42
33
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
34
|
+
def process_serially(self, iterable: iterable_input) -> Any:
|
|
35
|
+
logger.info("processing content serially")
|
|
36
|
+
if iterable:
|
|
37
|
+
if len(iterable) == 1:
|
|
38
|
+
return [self.run(**iterable[0])]
|
|
39
|
+
if self.context.tqdm:
|
|
40
|
+
return [self.run(**it) for it in tqdm(iterable, desc=self.identifier)]
|
|
41
|
+
return [self.run(**it) for it in iterable]
|
|
42
|
+
return [self.run()]
|
|
43
|
+
|
|
44
|
+
async def _process_async(self, iterable: iterable_input) -> Any:
|
|
45
|
+
if iterable:
|
|
46
|
+
if len(iterable) == 1:
|
|
47
|
+
return [await self.run_async(**iterable[0])]
|
|
48
|
+
if self.context.tqdm:
|
|
49
|
+
return await tqdm_asyncio.gather(
|
|
50
|
+
*[self.run_async(**i) for i in iterable], desc=self.identifier
|
|
51
|
+
)
|
|
52
|
+
return await asyncio.gather(*[self.run_async(**i) for i in iterable])
|
|
53
|
+
return [await self.run_async()]
|
|
54
|
+
|
|
55
|
+
def process_async(self, iterable: iterable_input) -> Any:
|
|
56
|
+
logger.info("processing content async")
|
|
57
|
+
return self.asyncio_run(fn=self._process_async, iterable=iterable)
|
|
58
|
+
|
|
59
|
+
def asyncio_run(
|
|
60
|
+
self, fn: Callable[[Any, Any], Awaitable[Any]], *args: Any, **kwargs: Any
|
|
61
|
+
) -> Any:
|
|
62
|
+
current_loop = asyncio._get_running_loop()
|
|
63
|
+
if current_loop is None:
|
|
64
|
+
return asyncio.run(fn(*args, **kwargs))
|
|
65
|
+
with ThreadPoolExecutor(thread_name_prefix="asyncio") as thread_pool:
|
|
66
|
+
logger.warning(
|
|
67
|
+
f"async code being run in dedicated thread pool "
|
|
68
|
+
f"to not conflict with existing event loop: {current_loop}"
|
|
69
|
+
)
|
|
46
70
|
|
|
71
|
+
def wrapped():
|
|
72
|
+
return asyncio.run(fn(*args, **kwargs))
|
|
47
73
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
"""
|
|
51
|
-
Class that encapsulates logic to run during a single pipeline step
|
|
52
|
-
"""
|
|
74
|
+
future = thread_pool.submit(wrapped)
|
|
75
|
+
return future.result()
|
|
53
76
|
|
|
54
|
-
|
|
77
|
+
def process_multiprocess(self, iterable: iterable_input) -> Any:
|
|
78
|
+
logger.info("processing content across processes")
|
|
55
79
|
|
|
56
|
-
|
|
57
|
-
|
|
80
|
+
if iterable:
|
|
81
|
+
if len(iterable) == 1:
|
|
82
|
+
return self.process_serially(iterable)
|
|
83
|
+
if self.context.num_processes == 1:
|
|
84
|
+
return self.process_serially(iterable)
|
|
85
|
+
with mp.Pool(
|
|
86
|
+
processes=self.context.num_processes,
|
|
87
|
+
initializer=self._init_mp,
|
|
88
|
+
initargs=(
|
|
89
|
+
logging.DEBUG if self.context.verbose else logging.INFO,
|
|
90
|
+
self.context.otel_endpoint,
|
|
91
|
+
),
|
|
92
|
+
) as pool:
|
|
93
|
+
otel_context = OtelHandler.inject_context()
|
|
94
|
+
for iter in iterable:
|
|
95
|
+
iter[OtelHandler.trace_context_key] = otel_context
|
|
96
|
+
if self.context.tqdm:
|
|
97
|
+
return list(
|
|
98
|
+
tqdm(
|
|
99
|
+
pool.imap_unordered(func=self._wrap_mp, iterable=iterable),
|
|
100
|
+
total=len(iterable),
|
|
101
|
+
desc=self.identifier,
|
|
102
|
+
)
|
|
103
|
+
)
|
|
104
|
+
return pool.map(self._wrap_mp, iterable)
|
|
105
|
+
return [self.run()]
|
|
106
|
+
|
|
107
|
+
def _wrap_mp(self, input_kwargs: dict) -> Any:
|
|
108
|
+
# Allow mapping of kwargs via multiprocessing map()
|
|
109
|
+
return self.run(**input_kwargs)
|
|
110
|
+
|
|
111
|
+
def _init_mp(self, log_level: int, endpoint: Optional[str] = None) -> None:
|
|
112
|
+
# Init logger for each spawned process when using multiprocessing pool
|
|
113
|
+
make_default_logger(level=log_level)
|
|
114
|
+
otel_handler = OtelHandler(otel_endpoint=endpoint, log_out=logger.debug)
|
|
115
|
+
otel_handler.init_trace()
|
|
116
|
+
|
|
117
|
+
@instrument()
|
|
118
|
+
def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
|
|
119
|
+
iterable = iterable or []
|
|
58
120
|
if iterable:
|
|
59
121
|
logger.info(
|
|
60
122
|
f"calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
|
|
61
123
|
)
|
|
62
|
-
|
|
63
|
-
self.initialize()
|
|
64
|
-
if not self.supported_multiprocessing():
|
|
65
|
-
if iterable:
|
|
66
|
-
self.result = self.run(iterable)
|
|
67
|
-
else:
|
|
68
|
-
self.result = self.run()
|
|
69
|
-
elif self.pipeline_context.num_processes == 1:
|
|
70
|
-
if iterable:
|
|
71
|
-
self.result = [self.run(it) for it in iterable]
|
|
72
|
-
else:
|
|
73
|
-
self.result = self.run()
|
|
74
124
|
else:
|
|
75
|
-
with
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
"""A pipeline node representing logic to pull data from a source using base ingest documents.
|
|
130
|
-
|
|
131
|
-
This class encapsulates the logic for pulling data from a specified source using base ingest
|
|
132
|
-
documents. The output of this logic is expected to be in JSON format representing the data
|
|
133
|
-
itself.
|
|
134
|
-
|
|
135
|
-
Attributes:
|
|
136
|
-
read_config: A configuration object specifying how to read data from the source.
|
|
137
|
-
retry_strategy_config: Optional configuration specifying the strategy for network errors.
|
|
138
|
-
|
|
139
|
-
Properties:
|
|
140
|
-
retry_strategy: A retry handler configured based on the retry strategy configuration.
|
|
141
|
-
|
|
142
|
-
Methods:
|
|
143
|
-
initialize: Initializes the source node and logs the process.
|
|
144
|
-
run: Abstract method for downloading data associated with ingest documents.
|
|
145
|
-
"""
|
|
146
|
-
|
|
147
|
-
read_config: ReadConfig
|
|
148
|
-
retry_strategy_config: t.Optional[RetryStrategyConfig] = None
|
|
125
|
+
logger.info(f"calling {self.__class__.__name__} with no inputs")
|
|
126
|
+
if self.context.async_supported and self.process.is_async():
|
|
127
|
+
return self.process_async(iterable=iterable)
|
|
128
|
+
if self.context.mp_supported:
|
|
129
|
+
return self.process_multiprocess(iterable=iterable)
|
|
130
|
+
return self.process_serially(iterable=iterable)
|
|
131
|
+
|
|
132
|
+
def _run(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
|
|
133
|
+
return self.asyncio_run(fn=self.run_async, _fn=fn, **kwargs)
|
|
134
|
+
|
|
135
|
+
async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
|
|
136
|
+
raise NotImplementedError
|
|
137
|
+
|
|
138
|
+
def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]:
|
|
139
|
+
kwargs = kwargs.copy()
|
|
140
|
+
otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
|
|
141
|
+
tracer = otel_handler.get_tracer()
|
|
142
|
+
if trace_context := kwargs.pop(otel_handler.trace_context_key, {}):
|
|
143
|
+
otel_handler.attach_context(trace_context=trace_context)
|
|
144
|
+
attributes = {}
|
|
145
|
+
if file_data_path := kwargs.get("file_data_path"):
|
|
146
|
+
attributes["file_id"] = Path(file_data_path).stem
|
|
147
|
+
try:
|
|
148
|
+
with tracer.start_as_current_span(self.identifier, record_exception=True) as span:
|
|
149
|
+
otel_handler.set_attributes(span, attributes)
|
|
150
|
+
fn = _fn or self.process.run
|
|
151
|
+
return self._run(fn=fn, **kwargs)
|
|
152
|
+
except Exception as e:
|
|
153
|
+
logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
|
|
154
|
+
if "file_data_path" in kwargs:
|
|
155
|
+
self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
|
|
156
|
+
if self.context.raise_on_error:
|
|
157
|
+
raise e
|
|
158
|
+
return None
|
|
159
|
+
|
|
160
|
+
async def run_async(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
|
|
161
|
+
otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
|
|
162
|
+
try:
|
|
163
|
+
attributes = {}
|
|
164
|
+
if file_data_path := kwargs.get("file_data_path"):
|
|
165
|
+
attributes["file_id"] = Path(file_data_path).stem
|
|
166
|
+
with otel_handler.get_tracer().start_as_current_span(
|
|
167
|
+
self.identifier, record_exception=True
|
|
168
|
+
) as span:
|
|
169
|
+
otel_handler.set_attributes(span, attributes)
|
|
170
|
+
fn = _fn or self.process.run_async
|
|
171
|
+
return await self._run_async(fn=fn, **kwargs)
|
|
172
|
+
except Exception as e:
|
|
173
|
+
logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
|
|
174
|
+
if "file_data_path" in kwargs:
|
|
175
|
+
self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
|
|
176
|
+
if self.context.raise_on_error:
|
|
177
|
+
raise e
|
|
178
|
+
return None
|
|
149
179
|
|
|
150
180
|
@property
|
|
151
|
-
def
|
|
152
|
-
|
|
153
|
-
import backoff
|
|
181
|
+
def cache_dir(self) -> Path:
|
|
182
|
+
return Path(self.context.work_dir) / self.identifier
|
|
154
183
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
max_time=retry_strategy_config.max_retry_time,
|
|
161
|
-
max_tries=retry_strategy_config.max_retries,
|
|
162
|
-
logger=logger,
|
|
163
|
-
start_log_level=logger.level,
|
|
164
|
-
backoff_log_level=logger.level,
|
|
165
|
-
)
|
|
166
|
-
return None
|
|
167
|
-
|
|
168
|
-
def initialize(self):
|
|
169
|
-
logger.info("Running source node to download data associated with ingest docs")
|
|
170
|
-
super().initialize()
|
|
171
|
-
|
|
172
|
-
@abstractmethod
|
|
173
|
-
def run(self, ingest_doc_json: str) -> t.Optional[str]:
|
|
174
|
-
pass
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
@dataclass
|
|
178
|
-
class PartitionNode(PipelineNode):
|
|
179
|
-
"""
|
|
180
|
-
Encapsulates logic to run partition on the json files as the output of the source node
|
|
181
|
-
"""
|
|
182
|
-
|
|
183
|
-
partition_config: PartitionConfig
|
|
184
|
-
partition_kwargs: dict = field(default_factory=dict)
|
|
185
|
-
|
|
186
|
-
def initialize(self):
|
|
187
|
-
logger.info(
|
|
188
|
-
f"Running partition node to extract content from json files. "
|
|
189
|
-
f"Config: {self.partition_config.to_json()}, "
|
|
190
|
-
f"partition kwargs: {json.dumps(self.partition_kwargs)}]",
|
|
191
|
-
)
|
|
192
|
-
super().initialize()
|
|
193
|
-
|
|
194
|
-
def create_hash(self) -> str:
|
|
195
|
-
hash_dict = self.partition_config.to_dict()
|
|
196
|
-
hash_dict["partition_kwargs"] = self.partition_kwargs
|
|
197
|
-
return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]
|
|
198
|
-
|
|
199
|
-
@abstractmethod
|
|
200
|
-
def run(self, json_path: str) -> t.Optional[str]:
|
|
201
|
-
pass
|
|
202
|
-
|
|
203
|
-
def get_path(self) -> Path:
|
|
204
|
-
return (Path(self.pipeline_context.work_dir) / "partitioned").resolve()
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
@dataclass
|
|
208
|
-
class ReformatNode(PipelineNode, ABC):
|
|
209
|
-
"""
|
|
210
|
-
Encapsulated any logic to reformat the output List[Element]
|
|
211
|
-
content from partition before writing it
|
|
212
|
-
"""
|
|
213
|
-
|
|
214
|
-
@abstractmethod
|
|
215
|
-
def run(self, elements_json: str) -> t.Optional[str]:
|
|
216
|
-
pass
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
@dataclass
|
|
220
|
-
class WriteNode(PipelineNode):
|
|
221
|
-
"""
|
|
222
|
-
Encapsulated logic to write the final result to a downstream data connection
|
|
223
|
-
"""
|
|
224
|
-
|
|
225
|
-
dest_doc_connector: BaseDestinationConnector
|
|
226
|
-
|
|
227
|
-
@abstractmethod
|
|
228
|
-
def run(self, json_paths: t.List[str]):
|
|
229
|
-
pass
|
|
230
|
-
|
|
231
|
-
def initialize(self):
|
|
232
|
-
logger.info(
|
|
233
|
-
f"Running write node to upload content. "
|
|
234
|
-
f"Destination connector: {self.dest_doc_connector.to_json(redact_sensitive=True)}]",
|
|
235
|
-
)
|
|
236
|
-
super().initialize()
|
|
237
|
-
self.dest_doc_connector.initialize()
|
|
238
|
-
|
|
239
|
-
def supported_multiprocessing(self) -> bool:
|
|
240
|
-
return False
|
|
184
|
+
def delete_cache(self):
|
|
185
|
+
if self.context.iter_delete and self.cache_dir.exists():
|
|
186
|
+
cache_dir = self.cache_dir
|
|
187
|
+
logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
|
|
188
|
+
shutil.rmtree(cache_dir)
|
|
241
189
|
|
|
242
190
|
|
|
243
191
|
@dataclass
|
|
244
|
-
class
|
|
245
|
-
|
|
246
|
-
Encapsulated logic to copy the final result of the pipeline to the designated output location.
|
|
247
|
-
"""
|
|
192
|
+
class BatchPipelineStep(PipelineStep, ABC):
|
|
193
|
+
process: Uploader
|
|
248
194
|
|
|
249
|
-
def
|
|
250
|
-
|
|
251
|
-
|
|
195
|
+
def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
|
|
196
|
+
if self.context.mp_supported and self.process.is_batch():
|
|
197
|
+
return self.run_batch(contents=iterable)
|
|
198
|
+
super().__call__(iterable=iterable)
|
|
252
199
|
|
|
253
200
|
@abstractmethod
|
|
254
|
-
def
|
|
201
|
+
def _run_batch(self, contents: iterable_input, **kwargs) -> Any:
|
|
255
202
|
pass
|
|
256
203
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
logger.info("Running permissions node to cleanup the permissions folder")
|
|
266
|
-
super().initialize()
|
|
267
|
-
|
|
268
|
-
@abstractmethod
|
|
269
|
-
def run(self):
|
|
270
|
-
pass
|
|
204
|
+
def run_batch(self, contents: iterable_input, **kwargs) -> Any:
|
|
205
|
+
try:
|
|
206
|
+
return self._run_batch(contents=contents, **kwargs)
|
|
207
|
+
except Exception as e:
|
|
208
|
+
self.context.status[self.identifier] = {"step_error": str(e)}
|
|
209
|
+
if self.context.raise_on_error:
|
|
210
|
+
raise e
|
|
211
|
+
return None
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from functools import wraps
|
|
2
2
|
from typing import Callable, Optional
|
|
3
3
|
|
|
4
|
-
from unstructured_ingest.
|
|
5
|
-
from unstructured_ingest.
|
|
4
|
+
from unstructured_ingest.logger import logger
|
|
5
|
+
from unstructured_ingest.otel import OtelHandler
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def instrument(
|