unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- examples/airtable.py +44 -0
- examples/azure_cognitive_search.py +55 -0
- examples/chroma.py +54 -0
- examples/couchbase.py +55 -0
- examples/databricks_volumes_dest.py +55 -0
- examples/databricks_volumes_source.py +53 -0
- examples/delta_table.py +45 -0
- examples/discord_example.py +36 -0
- examples/elasticsearch.py +49 -0
- examples/google_drive.py +45 -0
- examples/kdbai.py +54 -0
- examples/local.py +36 -0
- examples/milvus.py +44 -0
- examples/mongodb.py +53 -0
- examples/opensearch.py +50 -0
- examples/pinecone.py +57 -0
- examples/s3.py +38 -0
- examples/salesforce.py +44 -0
- examples/sharepoint.py +47 -0
- examples/singlestore.py +49 -0
- examples/sql.py +90 -0
- examples/vectara.py +54 -0
- examples/weaviate.py +44 -0
- test/integration/chunkers/test_chunkers.py +1 -1
- test/integration/connectors/conftest.py +1 -1
- test/integration/connectors/databricks/test_volumes_native.py +3 -3
- test/integration/connectors/discord/test_discord.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +2 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
- test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
- test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
- test/integration/connectors/sql/test_postgres.py +2 -2
- test/integration/connectors/sql/test_singlestore.py +2 -2
- test/integration/connectors/sql/test_snowflake.py +2 -2
- test/integration/connectors/sql/test_sqlite.py +2 -2
- test/integration/connectors/sql/test_vastdb.py +1 -1
- test/integration/connectors/test_astradb.py +2 -2
- test/integration/connectors/test_azure_ai_search.py +2 -2
- test/integration/connectors/test_chroma.py +2 -2
- test/integration/connectors/test_confluence.py +1 -1
- test/integration/connectors/test_delta_table.py +2 -2
- test/integration/connectors/test_dropbox.py +2 -2
- test/integration/connectors/test_github.py +1 -1
- test/integration/connectors/test_google_drive.py +2 -2
- test/integration/connectors/test_jira.py +1 -1
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +2 -2
- test/integration/connectors/test_mongodb.py +2 -2
- test/integration/connectors/test_neo4j.py +7 -7
- test/integration/connectors/test_notion.py +2 -2
- test/integration/connectors/test_onedrive.py +2 -2
- test/integration/connectors/test_pinecone.py +3 -3
- test/integration/connectors/test_qdrant.py +6 -6
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +3 -3
- test/integration/connectors/test_sharepoint.py +1 -1
- test/integration/connectors/test_vectara.py +4 -4
- test/integration/connectors/test_zendesk.py +2 -2
- test/integration/connectors/utils/validation/destination.py +2 -2
- test/integration/connectors/utils/validation/source.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +1 -1
- test/integration/connectors/weaviate/test_local.py +2 -2
- test/integration/embedders/test_azure_openai.py +1 -1
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -1
- test/integration/embedders/test_mixedbread.py +1 -1
- test/integration/embedders/test_octoai.py +2 -2
- test/integration/embedders/test_openai.py +2 -2
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +1 -1
- test/integration/embedders/test_voyageai.py +1 -1
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
- test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
- test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
- test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
- test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
- test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
- test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
- test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
- test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
- test/unit/test_html.py +1 -1
- test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
- test/unit/test_utils.py +106 -97
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/__init__.py +0 -14
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +259 -9
- unstructured_ingest/cli/base/dest.py +58 -61
- unstructured_ingest/cli/base/src.py +54 -36
- unstructured_ingest/cli/cli.py +4 -17
- unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
- unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
- unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
- unstructured_ingest/embed/bedrock.py +3 -3
- unstructured_ingest/embed/octoai.py +3 -3
- unstructured_ingest/embed/openai.py +3 -3
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +4 -4
- unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
- unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
- unstructured_ingest/{v2/otel.py → otel.py} +1 -1
- unstructured_ingest/pipeline/__init__.py +0 -22
- unstructured_ingest/pipeline/interfaces.py +179 -238
- unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
- unstructured_ingest/pipeline/pipeline.py +388 -97
- unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
- unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
- unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
- unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
- unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
- unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +55 -27
- unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
- unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
- unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
- unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
- unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
- unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
- unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
- unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
- unstructured_ingest/utils/compression.py +1 -48
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/utils/html.py +3 -3
- unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
- unstructured_ingest/utils/string_and_date_utils.py +1 -1
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/METADATA +98 -97
- unstructured_ingest-0.7.1.dist-info/RECORD +370 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/top_level.txt +1 -0
- test/unit/v2/test_utils.py +0 -82
- unstructured_ingest/cli/cmd_factory.py +0 -12
- unstructured_ingest/cli/cmds/__init__.py +0 -145
- unstructured_ingest/cli/cmds/airtable.py +0 -69
- unstructured_ingest/cli/cmds/astradb.py +0 -99
- unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
- unstructured_ingest/cli/cmds/biomed.py +0 -52
- unstructured_ingest/cli/cmds/chroma.py +0 -104
- unstructured_ingest/cli/cmds/clarifai.py +0 -71
- unstructured_ingest/cli/cmds/confluence.py +0 -69
- unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
- unstructured_ingest/cli/cmds/delta_table.py +0 -94
- unstructured_ingest/cli/cmds/discord.py +0 -47
- unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
- unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
- unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
- unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
- unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
- unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
- unstructured_ingest/cli/cmds/github.py +0 -54
- unstructured_ingest/cli/cmds/gitlab.py +0 -54
- unstructured_ingest/cli/cmds/google_drive.py +0 -49
- unstructured_ingest/cli/cmds/hubspot.py +0 -70
- unstructured_ingest/cli/cmds/jira.py +0 -71
- unstructured_ingest/cli/cmds/kafka.py +0 -102
- unstructured_ingest/cli/cmds/local.py +0 -43
- unstructured_ingest/cli/cmds/mongodb.py +0 -72
- unstructured_ingest/cli/cmds/notion.py +0 -48
- unstructured_ingest/cli/cmds/onedrive.py +0 -66
- unstructured_ingest/cli/cmds/opensearch.py +0 -117
- unstructured_ingest/cli/cmds/outlook.py +0 -67
- unstructured_ingest/cli/cmds/pinecone.py +0 -71
- unstructured_ingest/cli/cmds/qdrant.py +0 -124
- unstructured_ingest/cli/cmds/reddit.py +0 -67
- unstructured_ingest/cli/cmds/salesforce.py +0 -58
- unstructured_ingest/cli/cmds/sharepoint.py +0 -66
- unstructured_ingest/cli/cmds/slack.py +0 -56
- unstructured_ingest/cli/cmds/sql.py +0 -66
- unstructured_ingest/cli/cmds/vectara.py +0 -66
- unstructured_ingest/cli/cmds/weaviate.py +0 -98
- unstructured_ingest/cli/cmds/wikipedia.py +0 -40
- unstructured_ingest/cli/common.py +0 -7
- unstructured_ingest/cli/interfaces.py +0 -663
- unstructured_ingest/cli/utils.py +0 -205
- unstructured_ingest/connector/airtable.py +0 -309
- unstructured_ingest/connector/astradb.py +0 -267
- unstructured_ingest/connector/azure_ai_search.py +0 -144
- unstructured_ingest/connector/biomed.py +0 -320
- unstructured_ingest/connector/chroma.py +0 -158
- unstructured_ingest/connector/clarifai.py +0 -122
- unstructured_ingest/connector/confluence.py +0 -285
- unstructured_ingest/connector/databricks_volumes.py +0 -137
- unstructured_ingest/connector/delta_table.py +0 -203
- unstructured_ingest/connector/discord.py +0 -180
- unstructured_ingest/connector/elasticsearch.py +0 -396
- unstructured_ingest/connector/fsspec/azure.py +0 -78
- unstructured_ingest/connector/fsspec/box.py +0 -109
- unstructured_ingest/connector/fsspec/dropbox.py +0 -160
- unstructured_ingest/connector/fsspec/fsspec.py +0 -359
- unstructured_ingest/connector/fsspec/gcs.py +0 -82
- unstructured_ingest/connector/fsspec/s3.py +0 -62
- unstructured_ingest/connector/fsspec/sftp.py +0 -81
- unstructured_ingest/connector/git.py +0 -124
- unstructured_ingest/connector/github.py +0 -174
- unstructured_ingest/connector/gitlab.py +0 -142
- unstructured_ingest/connector/google_drive.py +0 -348
- unstructured_ingest/connector/hubspot.py +0 -278
- unstructured_ingest/connector/jira.py +0 -469
- unstructured_ingest/connector/kafka.py +0 -293
- unstructured_ingest/connector/local.py +0 -139
- unstructured_ingest/connector/mongodb.py +0 -284
- unstructured_ingest/connector/notion/client.py +0 -248
- unstructured_ingest/connector/notion/connector.py +0 -469
- unstructured_ingest/connector/notion/helpers.py +0 -584
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
- unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
- unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
- unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
- unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
- unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
- unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
- unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
- unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
- unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
- unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
- unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
- unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
- unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
- unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
- unstructured_ingest/connector/notion/types/date.py +0 -26
- unstructured_ingest/connector/notion/types/file.py +0 -51
- unstructured_ingest/connector/notion/types/user.py +0 -76
- unstructured_ingest/connector/onedrive.py +0 -232
- unstructured_ingest/connector/opensearch.py +0 -218
- unstructured_ingest/connector/outlook.py +0 -285
- unstructured_ingest/connector/pinecone.py +0 -150
- unstructured_ingest/connector/qdrant.py +0 -144
- unstructured_ingest/connector/reddit.py +0 -166
- unstructured_ingest/connector/registry.py +0 -109
- unstructured_ingest/connector/salesforce.py +0 -301
- unstructured_ingest/connector/sharepoint.py +0 -573
- unstructured_ingest/connector/slack.py +0 -224
- unstructured_ingest/connector/sql.py +0 -199
- unstructured_ingest/connector/vectara.py +0 -253
- unstructured_ingest/connector/weaviate.py +0 -190
- unstructured_ingest/connector/wikipedia.py +0 -208
- unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
- unstructured_ingest/enhanced_dataclass/core.py +0 -99
- unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
- unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
- unstructured_ingest/interfaces.py +0 -852
- unstructured_ingest/pipeline/copy.py +0 -19
- unstructured_ingest/pipeline/doc_factory.py +0 -12
- unstructured_ingest/pipeline/partition.py +0 -60
- unstructured_ingest/pipeline/permissions.py +0 -12
- unstructured_ingest/pipeline/reformat/chunking.py +0 -134
- unstructured_ingest/pipeline/reformat/embedding.py +0 -64
- unstructured_ingest/pipeline/source.py +0 -77
- unstructured_ingest/pipeline/utils.py +0 -6
- unstructured_ingest/pipeline/write.py +0 -18
- unstructured_ingest/processor.py +0 -93
- unstructured_ingest/runner/__init__.py +0 -104
- unstructured_ingest/runner/airtable.py +0 -35
- unstructured_ingest/runner/astradb.py +0 -34
- unstructured_ingest/runner/base_runner.py +0 -89
- unstructured_ingest/runner/biomed.py +0 -45
- unstructured_ingest/runner/confluence.py +0 -35
- unstructured_ingest/runner/delta_table.py +0 -34
- unstructured_ingest/runner/discord.py +0 -35
- unstructured_ingest/runner/elasticsearch.py +0 -40
- unstructured_ingest/runner/fsspec/azure.py +0 -30
- unstructured_ingest/runner/fsspec/box.py +0 -28
- unstructured_ingest/runner/fsspec/dropbox.py +0 -30
- unstructured_ingest/runner/fsspec/fsspec.py +0 -40
- unstructured_ingest/runner/fsspec/gcs.py +0 -28
- unstructured_ingest/runner/fsspec/s3.py +0 -28
- unstructured_ingest/runner/fsspec/sftp.py +0 -28
- unstructured_ingest/runner/github.py +0 -37
- unstructured_ingest/runner/gitlab.py +0 -37
- unstructured_ingest/runner/google_drive.py +0 -35
- unstructured_ingest/runner/hubspot.py +0 -35
- unstructured_ingest/runner/jira.py +0 -35
- unstructured_ingest/runner/kafka.py +0 -34
- unstructured_ingest/runner/local.py +0 -23
- unstructured_ingest/runner/mongodb.py +0 -34
- unstructured_ingest/runner/notion.py +0 -61
- unstructured_ingest/runner/onedrive.py +0 -35
- unstructured_ingest/runner/opensearch.py +0 -40
- unstructured_ingest/runner/outlook.py +0 -33
- unstructured_ingest/runner/reddit.py +0 -35
- unstructured_ingest/runner/salesforce.py +0 -33
- unstructured_ingest/runner/sharepoint.py +0 -35
- unstructured_ingest/runner/slack.py +0 -33
- unstructured_ingest/runner/utils.py +0 -47
- unstructured_ingest/runner/wikipedia.py +0 -35
- unstructured_ingest/runner/writers/__init__.py +0 -48
- unstructured_ingest/runner/writers/astradb.py +0 -22
- unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
- unstructured_ingest/runner/writers/base_writer.py +0 -26
- unstructured_ingest/runner/writers/chroma.py +0 -22
- unstructured_ingest/runner/writers/clarifai.py +0 -19
- unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
- unstructured_ingest/runner/writers/delta_table.py +0 -24
- unstructured_ingest/runner/writers/elasticsearch.py +0 -24
- unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
- unstructured_ingest/runner/writers/fsspec/box.py +0 -21
- unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
- unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
- unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
- unstructured_ingest/runner/writers/kafka.py +0 -21
- unstructured_ingest/runner/writers/mongodb.py +0 -21
- unstructured_ingest/runner/writers/opensearch.py +0 -26
- unstructured_ingest/runner/writers/pinecone.py +0 -21
- unstructured_ingest/runner/writers/qdrant.py +0 -19
- unstructured_ingest/runner/writers/sql.py +0 -22
- unstructured_ingest/runner/writers/vectara.py +0 -22
- unstructured_ingest/runner/writers/weaviate.py +0 -21
- unstructured_ingest/utils/google_filetype.py +0 -9
- unstructured_ingest/v2/__init__.py +0 -1
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +0 -4
- unstructured_ingest/v2/cli/base/cmd.py +0 -269
- unstructured_ingest/v2/cli/base/dest.py +0 -85
- unstructured_ingest/v2/cli/base/src.py +0 -85
- unstructured_ingest/v2/cli/cli.py +0 -24
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/logger.py +0 -126
- unstructured_ingest/v2/main.py +0 -11
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +0 -211
- unstructured_ingest/v2/pipeline/pipeline.py +0 -408
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
- {test/unit/v2 → examples}/__init__.py +0 -0
- /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
- /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/data_generator.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
- /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
- /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
- /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
- /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
- /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
- /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
- /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
- /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
- /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
- /unstructured_ingest/{v2 → utils}/constants.py +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.1.dist-info}/entry_points.txt +0 -0
|
@@ -1,852 +0,0 @@
|
|
|
1
|
-
"""Defines Abstract Base Classes (ABC's) core to batch processing documents
|
|
2
|
-
through Unstructured."""
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
import functools
|
|
7
|
-
import json
|
|
8
|
-
import os
|
|
9
|
-
import re
|
|
10
|
-
from abc import ABC, abstractmethod
|
|
11
|
-
from dataclasses import InitVar, dataclass, field
|
|
12
|
-
from datetime import datetime
|
|
13
|
-
from pathlib import Path
|
|
14
|
-
from typing import TYPE_CHECKING, Any, Optional, Type, TypeVar
|
|
15
|
-
|
|
16
|
-
from dataclasses_json import DataClassJsonMixin
|
|
17
|
-
from dataclasses_json.core import Json, _decode_dataclass
|
|
18
|
-
|
|
19
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
|
|
20
|
-
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
21
|
-
from unstructured_ingest.error import PartitionError, SourceConnectionError
|
|
22
|
-
from unstructured_ingest.logger import logger
|
|
23
|
-
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
24
|
-
from unstructured_ingest.v2.unstructured_api import call_api
|
|
25
|
-
|
|
26
|
-
if TYPE_CHECKING:
|
|
27
|
-
from unstructured.documents.elements import Element
|
|
28
|
-
|
|
29
|
-
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
|
|
30
|
-
|
|
31
|
-
# Type variable bound to DataClassJsonMixin; it annotates ``cls`` and the
# return value of the ``from_dict`` classmethods defined in this module.
A = TypeVar("A", bound="DataClassJsonMixin")

# -- InitVar combined with ``from __future__ import annotations`` raises a
# -- TypeError; giving InitVar a no-op ``__call__`` works around it.
# -- Background: https://stackoverflow.com/questions/70400639/
InitVar.__call__ = lambda *args: None  # type: ignore

# Protocols that FsspecConfig accepts as the scheme of ``remote_url``.
SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
    "s3", "s3a",
    "abfs", "az",
    "gs", "gcs",
    "box",
    "dropbox",
    "sftp",
]
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
@dataclass
class BaseSessionHandle(ABC):
    """Base class for resources that live inside a single process.

    Typical example: a connection object used to issue the requests that
    fetch documents.
    """
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
@dataclass
class BaseConfig(EnhancedDataClassJsonMixin, ABC):
    """Common root of every config dataclass in this module; declares no fields."""
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
@dataclass
class AccessConfig(BaseConfig):
    """Marker config for access-specific settings.

    Intended to hold whatever sensitive information accompanies the other
    configs, together with any access-only options.
    """
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
@dataclass
class RetryStrategyConfig(BaseConfig):
    """Backoff/retry settings a decorator can read off ``self`` when an
    exception triggers a retry.

    Args:
        max_retries: Upper bound on the number of attempts. When the bound is
            reached the exception escapes. ``None`` (the default) means no
            limit. If a callable is passed, it will be evaluated at runtime
            and its return value used.
        max_retry_time: Upper bound on the total time spent retrying. When it
            expires the exception escapes. If a callable is passed, it will
            be evaluated at runtime and its return value used.
    """

    max_retries: Optional[int] = None
    max_retry_time: Optional[float] = None
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
@dataclass
class PartitionConfig(BaseConfig):
    """Options for partitioning a document and post-filtering its elements.

    ``partition_by_api``, ``partition_endpoint`` and ``api_key`` select and
    configure remote partitioning in ``partition_file``. ``fields_include``,
    ``metadata_include``, ``metadata_exclude`` and ``flatten_metadata`` are
    consumed by ``process_file`` when shaping each element dict.
    """

    pdf_infer_table_structure: bool = False
    strategy: str = "auto"
    ocr_languages: Optional[list[str]] = None
    encoding: Optional[str] = None
    additional_partition_args: dict[str, Any] = field(default_factory=dict)
    skip_infer_table_types: Optional[list[str]] = None
    # Top-level element keys kept in the output.
    fields_include: list[str] = field(
        default_factory=lambda: ["element_id", "text", "type", "metadata", "embeddings"],
    )
    flatten_metadata: bool = False
    # Mutually exclusive: process_file raises if both lists are non-empty.
    metadata_exclude: list[str] = field(default_factory=list)
    metadata_include: list[str] = field(default_factory=list)
    partition_endpoint: Optional[str] = "https://api.unstructuredapp.io/general/v0/general"
    partition_by_api: bool = False
    # Use the Field returned by enhanced_field directly. Wrapping it in str()
    # made the default the Field's repr (a truthy string) instead of None and
    # discarded the ``sensitive`` marker.
    api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
    hi_res_model_name: Optional[str] = None
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
@dataclass
class ProcessorConfig(BaseConfig):
    """Settings for the processing run itself (directories, parallelism, error policy)."""

    reprocess: bool = False
    verbose: bool = False
    # Resolved once, when the class body is evaluated.
    work_dir: str = str(
        Path.home().joinpath(".cache", "unstructured", "ingest", "pipeline").resolve()
    )
    output_dir: str = "structured-output"
    num_processes: int = 2
    raise_on_error: bool = False
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
@dataclass
class FileStorageConfig(BaseConfig):
    """Location and traversal options for a file-storage source or destination."""

    remote_url: str  # required; the only field without a default
    uncompress: bool = False
    recursive: bool = False
    file_glob: Optional[list[str]] = None
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
@dataclass
class FsspecConfig(FileStorageConfig):
    """File-storage config whose ``remote_url`` is split into fsspec parts.

    ``protocol``, ``path_without_protocol``, ``dir_path`` and ``file_path``
    are not constructor arguments; ``__post_init__`` derives them from
    ``remote_url``.
    """

    access_config: Optional[AccessConfig] = None
    protocol: str = field(init=False)
    path_without_protocol: str = field(init=False)
    dir_path: str = field(init=False)
    file_path: str = field(init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Return the access config as a dict, or ``{}`` when none is set."""
        if not self.access_config:
            return {}
        return self.access_config.to_dict(apply_name_overload=False)

    def __post_init__(self):
        """Validate ``remote_url`` and populate the derived path fields.

        Raises:
            ValueError: if ``remote_url`` has no ``://`` separator, uses an
                unsupported protocol, or does not match any accepted layout.
        """
        # Split on the first "://" only. Unpacking str.split("://") failed
        # with an opaque "values to unpack" error when the separator was
        # missing or occurred again later in the path.
        protocol, separator, remainder = self.remote_url.partition("://")
        if not separator:
            raise ValueError(
                f"Invalid path {self.remote_url}. "
                f"Expected <protocol>://<dir-path>/<file-or-dir-path>.",
            )
        self.protocol = protocol
        self.path_without_protocol = remainder
        if self.protocol not in SUPPORTED_REMOTE_FSSPEC_PROTOCOLS:
            raise ValueError(
                f"Protocol {self.protocol} not supported yet, only "
                f"{SUPPORTED_REMOTE_FSSPEC_PROTOCOLS} are supported.",
            )

        # dropbox root is an empty string
        match = re.match(rf"{self.protocol}://([\s])/", self.remote_url)
        if match and self.protocol == "dropbox":
            self.dir_path = " "
            self.file_path = ""
            return

        # dropbox paths can start with slash
        match = re.match(rf"{self.protocol}:///([^/\s]+?)/([^\s]*)", self.remote_url)
        if match and self.protocol == "dropbox":
            self.dir_path = match.group(1)
            self.file_path = match.group(2) or ""
            return

        # just a path with no trailing prefix
        match = re.match(rf"{self.protocol}://([^/\s]+?)(/*)$", self.remote_url)
        if match:
            self.dir_path = match.group(1)
            self.file_path = ""
            return

        # valid path with a dir and/or file
        match = re.match(rf"{self.protocol}://([^/\s]+?)/([^\s]*)", self.remote_url)
        if not match:
            raise ValueError(
                f"Invalid path {self.remote_url}. "
                f"Expected <protocol>://<dir-path>/<file-or-dir-path>.",
            )
        self.dir_path = match.group(1)
        self.file_path = match.group(2) or ""
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
@dataclass
class ReadConfig(BaseConfig):
    """Options governing how raw documents are downloaded."""

    # Directory holding raw documents for processing; they are removed
    # afterwards unless preserve_downloads is set.
    download_dir: Optional[str] = ""
    re_download: bool = False
    preserve_downloads: bool = False
    download_only: bool = False
    max_docs: Optional[int] = None
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
@dataclass
class EmbeddingConfig(BaseConfig):
    """Selects an embedding provider and carries its credentials."""

    provider: str
    # Use the Field returned by enhanced_field directly. Wrapping it in str()
    # made the default the Field's repr, a truthy string that get_embedder()
    # then forwarded as if it were a real API key.
    api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
    model_name: Optional[str] = None
    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None
    aws_region: Optional[str] = None

    def get_embedder(self) -> "BaseEmbeddingEncoder":
        """Build the encoder matching ``provider``.

        ``api_key`` and ``model_name`` are forwarded only when set. The
        bedrock branch ignores them and uses the ``aws_*`` fields instead.
        Provider modules are imported lazily inside each branch.

        Raises:
            ValueError: if ``provider`` is not one of the recognized names.
        """
        kwargs: dict[str, Any] = {}
        if self.api_key:
            kwargs["api_key"] = self.api_key
        if self.model_name:
            kwargs["model_name"] = self.model_name
        # TODO make this more dynamic to map to encoder configs
        if self.provider == "openai":
            from unstructured_ingest.embed.openai import (
                OpenAIEmbeddingConfig,
                OpenAIEmbeddingEncoder,
            )

            return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
        elif self.provider == "huggingface":
            from unstructured_ingest.embed.huggingface import (
                HuggingFaceEmbeddingConfig,
                HuggingFaceEmbeddingEncoder,
            )

            return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
        elif self.provider == "octoai":
            from unstructured_ingest.embed.octoai import (
                OctoAiEmbeddingConfig,
                OctoAIEmbeddingEncoder,
            )

            return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
        elif self.provider == "bedrock":
            from unstructured_ingest.embed.bedrock import (
                BedrockEmbeddingConfig,
                BedrockEmbeddingEncoder,
            )

            return BedrockEmbeddingEncoder(
                config=BedrockEmbeddingConfig(
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                    region_name=self.aws_region,
                )
            )
        elif self.provider == "vertexai":
            from unstructured_ingest.embed.vertexai import (
                VertexAIEmbeddingConfig,
                VertexAIEmbeddingEncoder,
            )

            return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
        elif self.provider == "voyageai":
            from unstructured_ingest.embed.voyageai import (
                VoyageAIEmbeddingConfig,
                VoyageAIEmbeddingEncoder,
            )

            return VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(**kwargs))
        else:
            raise ValueError(f"{self.provider} not a recognized encoder")
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
@dataclass
class ChunkingConfig(BaseConfig):
    """Chunking options; every field left at ``None`` is simply unset."""

    # Init-only flag: consumed by __post_init__, not stored as a field.
    chunk_elements: InitVar[bool] = False
    chunking_strategy: Optional[str] = None
    combine_text_under_n_chars: Optional[int] = None
    include_orig_elements: Optional[bool] = None
    max_characters: Optional[int] = None
    multipage_sections: Optional[bool] = None
    new_after_n_chars: Optional[int] = None
    overlap: Optional[int] = None
    overlap_all: Optional[bool] = None

    def __post_init__(self, chunk_elements: bool) -> None:
        """Default ``chunking_strategy`` to 'by_title' when chunking is requested.

        The default is applied only if ``chunk_elements`` is true and no
        strategy was given; an explicit ``chunking_strategy`` is left as is.
        """
        if not chunk_elements:
            return
        if self.chunking_strategy is None:
            self.chunking_strategy = "by_title"
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
@dataclass
class PermissionsConfig(BaseConfig):
    """Credentials used for permissions lookups.

    Each field declares a ``permissions_``-prefixed overload name via
    ``enhanced_field``.
    """

    application_id: Optional[str] = enhanced_field(
        overload_name="permissions_application_id",
    )
    tenant: Optional[str] = enhanced_field(
        overload_name="permissions_tenant",
    )
    client_cred: Optional[str] = enhanced_field(
        default=None,
        sensitive=True,
        overload_name="permissions_client_cred",
    )
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
# Module-level slot for a write session handle; starts out empty (None).
global_write_session_handle: Optional[BaseSessionHandle] = None
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
@dataclass
class WriteConfig(BaseConfig):
    """Base config for write-side options; declares no fields of its own."""
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
@dataclass
class BaseConnectorConfig(BaseConfig, ABC):
    """Abstract base on which connector-specific attributes are declared."""
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
@dataclass
class SourceMetadata(EnhancedDataClassJsonMixin, ABC):
    """Metadata about a document on its source system; every field is optional."""

    date_created: Optional[str] = None
    date_modified: Optional[str] = None
    version: Optional[str] = None
    source_url: Optional[str] = None
    exists: Optional[bool] = None
    permissions_data: Optional[list[dict[str, Any]]] = None
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
class IngestDocJsonMixin(EnhancedDataClassJsonMixin):
    """Serialization mixin that also emits selected ``@property`` values.

    DataClassJsonMixin only serializes dataclass fields, so the property
    names listed below are read off the instance and added to the dict
    explicitly.
    """

    # Added only when ``_source_metadata`` has already been populated.
    metadata_properties = [
        "date_created",
        "date_modified",
        "date_processed",
        "exists",
        "permissions_data",
        "version",
        "source_url",
    ]
    # Always added.
    properties_to_serialize = [
        "base_filename",
        "filename",
        "_output_filename",
        "record_locator",
        "_source_metadata",
        "unique_id",
    ]

    def add_props(self, as_dict: dict[str, Any], props: list[str]):
        """Copy each attribute named in ``props`` from ``self`` into ``as_dict``.

        Paths are stored as strings; DataClassJsonMixin values are stored as
        their ``to_dict(encode_json=False)`` form.
        """
        for name in props:
            value = getattr(self, name)
            if isinstance(value, Path):
                value = str(value)
            elif isinstance(value, DataClassJsonMixin):
                value = value.to_dict(encode_json=False)
            as_dict[name] = value

    def to_dict(self, **kwargs) -> dict[str, Json]:
        """Serialize the dataclass fields plus the configured properties."""
        as_dict = _asdict(self, **kwargs)
        # The session handle is never serialized.
        as_dict.pop("_session_handle", None)
        self.add_props(as_dict=as_dict, props=self.properties_to_serialize)
        if self._source_metadata is not None:
            self.add_props(as_dict=as_dict, props=self.metadata_properties)
        return as_dict

    @classmethod
    def from_dict(
        cls: Type[A], kvs: Json, *, infer_missing=False, apply_name_overload: bool = True
    ) -> A:
        """Rebuild a doc from ``kvs``, restoring source metadata and processed date."""
        doc = super().from_dict(
            kvs=kvs, infer_missing=infer_missing, apply_name_overload=apply_name_overload
        )
        meta = kvs.get("_source_metadata")
        if meta:
            doc._source_metadata = SourceMetadata.from_dict(meta)
        date_processed = kvs.get("_date_processed")
        if date_processed:
            doc._date_processed = date_processed
        return doc
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
class BatchIngestDocJsonMixin(EnhancedDataClassJsonMixin):
    """Serialization mixin that adds ``unique_id`` to the serialized dict.

    DataClassJsonMixin only serializes dataclass fields, so the property
    names listed in ``properties_to_serialize`` are added explicitly.
    """

    properties_to_serialize = ["unique_id"]

    def add_props(self, as_dict: dict[str, Any], props: list[str]):
        """Copy each attribute named in ``props`` from ``self`` into ``as_dict``.

        Paths are stored as strings; DataClassJsonMixin values are stored as
        their ``to_dict(encode_json=False)`` form.
        """
        for name in props:
            value = getattr(self, name)
            if isinstance(value, Path):
                value = str(value)
            elif isinstance(value, DataClassJsonMixin):
                value = value.to_dict(encode_json=False)
            as_dict[name] = value

    def to_dict(self, encode_json=False) -> dict[str, Json]:
        """Serialize the dataclass fields, then append the extra properties."""
        serialized = _asdict(self, encode_json=encode_json)
        self.add_props(as_dict=serialized, props=self.properties_to_serialize)
        return serialized

    @classmethod
    def from_dict(cls: Type[A], kvs: Json, *, infer_missing=False) -> A:
        """Decode ``kvs`` into an instance of ``cls``."""
        return _decode_dataclass(cls, kvs, infer_missing)
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
@dataclass
class BaseIngestDoc(ABC):
    """Abstract ingest doc: bundles the three configs and requires a ``unique_id``."""

    processor_config: ProcessorConfig
    read_config: ReadConfig
    connector_config: BaseConnectorConfig

    @property
    @abstractmethod
    def unique_id(self) -> str:
        """Identifier of this doc; subclasses must implement it."""
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
@dataclass
|
|
414
|
-
class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
|
|
415
|
-
"""An "ingest document" is specific to a connector, and provides
|
|
416
|
-
methods to fetch a single raw document, store it locally for processing, any cleanup
|
|
417
|
-
needed after successful processing of the doc, and the ability to write the doc's
|
|
418
|
-
structured outputs once processed.
|
|
419
|
-
|
|
420
|
-
Crucially, it is not responsible for the actual processing of the raw document.
|
|
421
|
-
"""
|
|
422
|
-
|
|
423
|
-
_source_metadata: Optional[SourceMetadata] = field(init=False, default=None)
|
|
424
|
-
_date_processed: Optional[str] = field(init=False, default=None)
|
|
425
|
-
|
|
426
|
-
@property
def source_metadata(self) -> SourceMetadata:
    """Return the source metadata, loading it on first access.

    Raises:
        ValueError: if ``update_source_metadata()`` leaves the field unset.
    """
    if self._source_metadata is not None:
        return self._source_metadata
    self.update_source_metadata()
    # Guarantee that update_source_metadata() actually populated the field.
    if self._source_metadata is None:
        raise ValueError("failed to set source metadata")
    return self._source_metadata

@source_metadata.setter
def source_metadata(self, value: SourceMetadata):
    """Replace the stored source metadata."""
    self._source_metadata = value
|
|
438
|
-
|
|
439
|
-
@property
def date_created(self) -> Optional[str]:
    """Creation date of the document on the source system."""
    metadata = self.source_metadata
    return metadata.date_created
|
|
443
|
-
|
|
444
|
-
@property
def date_modified(self) -> Optional[str]:
    """Last-modified date of the document on the source system."""
    metadata = self.source_metadata
    return metadata.date_modified
|
|
448
|
-
|
|
449
|
-
@property
def date_processed(self) -> Optional[str]:
    """Timestamp of the last processing by Unstructured.

    Backed by ``self._date_processed``, which ``process_file()`` assigns.
    """
    processed_at = self._date_processed
    return processed_at
|
|
454
|
-
|
|
455
|
-
@property
def exists(self) -> Optional[bool]:
    """Existence flag for the document on the remote source, from its metadata."""
    metadata = self.source_metadata
    return metadata.exists
|
|
459
|
-
|
|
460
|
-
@property
@abstractmethod
def filename(self):
    """Local filename the document has once fetched from the remote source.

    Abstract: the body is empty here and subclasses supply the value.
    """
|
|
464
|
-
|
|
465
|
-
@property
def base_filename(self) -> Optional[str]:
    """Return ``filename`` with the resolved download directory stripped.

    Returns ``None`` when ``read_config.download_dir`` or ``filename`` is
    falsy. If the filename does not start with the resolved download
    directory it is returned unchanged (as a string).
    """
    if self.read_config.download_dir and self.filename:
        download_path = str(Path(self.read_config.download_dir).resolve())
        full_path = str(self.filename)
        # Remove the download dir only as a leading prefix. str.replace would
        # also delete any later occurrence of the same text inside the path
        # (e.g. "/tmp/tmp/x" with download dir "/tmp" collapsed to "/x").
        if full_path.startswith(download_path):
            return full_path[len(download_path):]
        return full_path
    return None
|
|
473
|
-
|
|
474
|
-
@property
def base_output_filename(self) -> Optional[str]:
    """Return ``_output_filename`` with the resolved output directory stripped.

    Returns ``None`` when ``processor_config.output_dir`` or
    ``_output_filename`` is falsy. If the output filename does not start with
    the resolved output directory it is returned unchanged (as a string).
    """
    if self.processor_config.output_dir and self._output_filename:
        output_path = str(Path(self.processor_config.output_dir).resolve())
        full_path = str(self._output_filename)
        # Remove the output dir only as a leading prefix. str.replace would
        # also delete any later occurrence of the same text inside the path.
        if full_path.startswith(output_path):
            return full_path[len(output_path):]
        return full_path
    return None
|
|
482
|
-
|
|
483
|
-
@property
@abstractmethod
def _output_filename(self):
    """Filename under which this doc's structured output is stored.

    Abstract: the body is empty here and subclasses supply the value.
    """
|
|
487
|
-
|
|
488
|
-
@property
def record_locator(self) -> Optional[dict[str, Any]]:  # Values must be JSON-serializable
    """Data needed to uniquely identify the document on the source system.

    This base implementation has nothing to report and returns ``None``.
    """
    return None
|
|
493
|
-
|
|
494
|
-
@property
def unique_id(self) -> str:
    """Identifier of the doc; this implementation reuses ``filename``."""
    identifier = self.filename
    return identifier
|
|
497
|
-
|
|
498
|
-
@property
def source_url(self) -> Optional[str]:
    """URL of the source document, taken from its source metadata."""
    metadata = self.source_metadata
    return metadata.source_url  # type: ignore
|
|
502
|
-
|
|
503
|
-
@property
def version(self) -> Optional[str]:
    """Version of the source document, taken from its source metadata.

    The value may be a last-modified date, an explicit version number, or
    anything else that uniquely identifies the document's version.
    """
    metadata = self.source_metadata
    return metadata.version  # type: ignore
|
|
509
|
-
|
|
510
|
-
@property
def permissions_data(self) -> Optional[list[dict[str, Any]]]:
    """Access control data (permissions / sharing) from the source system.

    The ``source_metadata`` property loads the metadata on demand and raises
    rather than returning ``None``, so no extra ``is None`` check or explicit
    ``update_source_metadata()`` call is needed here.
    """
    return self.source_metadata.permissions_data  # type: ignore
|
|
516
|
-
|
|
517
|
-
@abstractmethod
def cleanup_file(self):
    """Remove the local copy of the file (or anything else) once processing succeeded.

    Abstract: the body is empty here and subclasses implement the cleanup.
    """
|
|
520
|
-
|
|
521
|
-
@staticmethod
def skip_if_file_exists(func):
    """Decorator that skips ``func`` when the doc's file is already present.

    The wrapped call is skipped (a debug message is logged and ``None`` is
    returned) when re-download is off and ``self.filename`` is an existing,
    non-empty file. Otherwise ``func`` runs and its result is returned.
    """

    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        must_run = (
            self.read_config.re_download
            or not self.filename.is_file()
            or not self.filename.stat().st_size
        )
        if must_run:
            return func(self, *args, **kwargs)
        logger.debug(f"file exists: {self.filename}, skipping {func.__name__}")
        return None

    return wrapper
|
|
538
|
-
|
|
539
|
-
# TODO: set as @abstractmethod and pass or raise NotImplementedError
|
|
540
|
-
def update_source_metadata(self, **kwargs) -> None:
    """Set the doc's source metadata.

    This base version stores an empty ``SourceMetadata`` and does not use
    ``kwargs``.
    """
    empty_metadata = SourceMetadata()
    self._source_metadata = empty_metadata
|
|
543
|
-
|
|
544
|
-
def update_permissions_data(self):
    """Set the doc's ``_permissions_data`` attribute.

    The attribute is meant to feed ``SourceMetadata.permissions_data`` and,
    through it, the ``permissions_data`` property. This base version stores
    ``None``.
    """
    no_permissions: Optional[list[dict[str, Any]]] = None
    self._permissions_data = no_permissions
|
|
549
|
-
|
|
550
|
-
# NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods
|
|
551
|
-
# in addition to or instead of get_file()
|
|
552
|
-
@abstractmethod
@SourceConnectionError.wrap
def get_file(self):
    """Fetch the "remote" doc and store it on the local filesystem.

    Abstract: the body is empty here and subclasses implement the fetch.
    """
|
|
556
|
-
|
|
557
|
-
def has_output(self) -> bool:
    """Return True when a non-empty structured output file already exists.

    The size is compared against zero so the result is a real ``bool``, as
    the annotation promises, rather than the raw ``st_size`` integer.
    """
    output_path = self._output_filename
    return output_path.is_file() and output_path.stat().st_size > 0
|
|
560
|
-
|
|
561
|
-
@PartitionError.wrap
def partition_file(
    self,
    partition_config: PartitionConfig,
    **partition_kwargs,
) -> list["Element"]:
    """Partition this doc's local file into elements.

    With ``partition_config.partition_by_api`` set, the file is sent to
    ``partition_config.partition_endpoint`` via ``call_api`` and the returned
    dicts are converted to elements. Otherwise ``partition`` runs locally and
    receives the doc's source details as ``DataSourceMetadata``.
    ``partition_kwargs`` is forwarded to whichever path is taken.
    """
    from unstructured.documents.elements import DataSourceMetadata
    from unstructured.partition.auto import partition
    from unstructured.staging.base import elements_from_dicts

    if partition_config.partition_by_api:
        endpoint = partition_config.partition_endpoint

        logger.debug(f"using remote partition ({endpoint})")
        elements_dicts = call_api(
            server_url=endpoint,
            api_key=partition_config.api_key,
            filename=Path(self.filename),
            api_parameters=partition_kwargs,
        )
        # TODO: add m_data_source_metadata to unstructured-api pipeline_api and then
        # pass the stringified json here
        return elements_from_dicts(elements_dicts)

    logger.debug("Using local partition")
    local_path = str(self.filename)
    source_details = DataSourceMetadata(
        url=self.source_url,
        version=self.version,
        record_locator=self.record_locator,
        date_created=self.date_created,
        date_modified=self.date_modified,
        date_processed=self.date_processed,
        permissions_data=self.permissions_data,
    )
    return partition(
        filename=local_path,
        data_source_metadata=source_details,
        **partition_kwargs,
    )
|
|
600
|
-
|
|
601
|
-
def process_file(
    self,
    partition_config: PartitionConfig,
    **partition_kwargs,
) -> Optional[list[dict[str, Any]]]:
    """Partition this document and post-process each element dictionary.

    Records the processing timestamp, partitions the file, then for every
    element applies the metadata include/exclude filters, restricts the
    top-level keys to ``partition_config.fields_include`` and optionally
    flattens ``metadata`` into the element itself.

    Args:
        partition_config: Supplies ``metadata_exclude``, ``metadata_include``,
            ``fields_include`` and ``flatten_metadata``; also forwarded to
            ``partition_file``.
        **partition_kwargs: Forwarded unchanged to ``partition_file``.

    Returns:
        The processed element dictionaries (also stored on
        ``self.isd_elems_no_filename``), or ``None`` when
        ``read_config.download_only`` is set.

    Raises:
        ValueError: If both ``metadata_exclude`` and ``metadata_include`` are
            set and at least one element was produced.
    """
    # Naive UTC timestamp; kept as-is so the stored ISO string format is unchanged.
    self._date_processed = datetime.utcnow().isoformat()
    if self.read_config.download_only:
        return None
    logger.info(f"processing {self.filename}")

    elements = self.partition_file(partition_config=partition_config, **partition_kwargs)
    element_dicts = [e.to_dict() for e in elements]

    self.isd_elems_no_filename: list[dict[str, Any]] = []
    for elem in element_dicts:
        if partition_config.metadata_exclude and partition_config.metadata_include:
            raise ValueError(
                "Arguments `--metadata-include` and `--metadata-exclude` are "
                "mutually exclusive with each other.",
            )
        elif partition_config.metadata_exclude:
            for ex in partition_config.metadata_exclude:
                if "." in ex:  # handle nested fields
                    *parent_keys, field_to_exclude = ex.split(".")
                    current_elem = elem
                    for key in parent_keys:
                        # Stop as soon as the path does not resolve to a dict.
                        # Continuing from the shallower level would remove a
                        # same-named key that the caller never asked to drop.
                        if not isinstance(current_elem, dict) or key not in current_elem:
                            current_elem = None
                            break
                        current_elem = current_elem[key]
                    if isinstance(current_elem, dict):
                        current_elem.pop(field_to_exclude, None)
                else:  # handle top-level fields
                    elem["metadata"].pop(ex, None)  # type: ignore[attr-defined]
        elif partition_config.metadata_include:
            in_list = partition_config.metadata_include
            # Snapshot the keys: the dict is mutated while filtering.
            for k in list(elem["metadata"].keys()):  # type: ignore[attr-defined]
                if k not in in_list:
                    elem["metadata"].pop(k, None)  # type: ignore[attr-defined]

        # Keep only the requested top-level fields of the element.
        in_list = partition_config.fields_include
        elem = {k: v for k, v in elem.items() if k in in_list}

        if partition_config.flatten_metadata and "metadata" in elem:
            metadata = elem.pop("metadata")
            elem.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))

        self.isd_elems_no_filename.append(elem)

    return self.isd_elems_no_filename
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
@dataclass
class BaseIngestDocBatch(BaseIngestDoc, BatchIngestDocJsonMixin, ABC):
    """Abstract ingest doc that carries a list of single ingest docs."""

    # Member documents of the batch; each instance gets its own empty list.
    ingest_docs: list[BaseSingleIngestDoc] = field(default_factory=list)

    @abstractmethod
    @SourceConnectionError.wrap
    def get_files(self):
        """Fetch the "remote" docs and store them locally on the filesystem."""
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
@dataclass
class BaseConnector(EnhancedDataClassJsonMixin, ABC):
    """Abstract base shared by the source and destination connectors."""

    @abstractmethod
    def check_connection(self):
        """Check the connection; concrete connectors must implement this."""
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
@dataclass
class BaseSourceConnector(BaseConnector, ABC):
    """Abstract Base Class for a connector to a remote source, e.g. S3 or Google Drive."""

    # Configuration objects every source connector is constructed with.
    processor_config: ProcessorConfig
    read_config: ReadConfig
    connector_config: BaseConnectorConfig

    @abstractmethod
    def cleanup(self, cur_dir=None):
        """Perform any additional cleanup needed after processing is complete,
        e.g. removing temporary download dirs that are empty.

        By convention, documents that failed to process are typically not cleaned up."""

    @abstractmethod
    def initialize(self):
        """Initialize the connector. Should also validate that the connector is
        properly configured, e.g. by listing a single document from the source."""

    @abstractmethod
    def get_ingest_docs(self):
        """Return all ingest docs (derived from BaseIngestDoc).

        This does not imply downloading all the raw documents themselves;
        rather, each IngestDoc is capable of fetching its content (in another
        process) with IngestDoc.get_file()."""
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
@dataclass
class BaseDestinationConnector(BaseConnector, ABC):
    """Abstract base for connectors that write element dictionaries to a destination.

    Subclasses implement ``initialize`` and ``write_dict``; ``conform_dict`` and
    ``normalize_dict`` are optional hooks applied to every element before writing.
    """

    write_config: WriteConfig
    connector_config: BaseConnectorConfig

    def __init__(self, write_config: WriteConfig, connector_config: BaseConnectorConfig):
        self.write_config = write_config
        self.connector_config = connector_config

    def conform_dict(self, data: dict[str, Any]) -> None:
        """Hook for when the original dictionary needs to be modified in place.

        The base implementation does nothing.
        """
        return None

    def normalize_dict(self, element_dict: dict[str, Any]) -> dict[str, Any]:
        """Hook for when the original dictionary needs to be mapped to a new one.

        The base implementation returns ``element_dict`` unchanged.
        """
        return element_dict

    @abstractmethod
    def initialize(self):
        """Initialize the connector. Should also validate that the connector is
        properly configured."""

    def write(self, docs: list[BaseSingleIngestDoc]) -> None:
        """Load the JSON output of ``docs`` and push it through the write pipeline."""
        self.modify_and_write_dict(elements_dict=self.get_elements_dict(docs=docs))

    def get_elements_dict(self, docs: list[BaseSingleIngestDoc]) -> list[dict[str, Any]]:
        """Concatenate the JSON content of every doc's output file into one list."""
        collected: list[dict[str, Any]] = []
        for ingest_doc in docs:
            output_path = ingest_doc._output_filename
            with open(output_path) as output_file:
                loaded = json.load(output_file)
            logger.info(
                f"Extending {len(loaded)} json elements from content in {output_path}",
            )
            collected.extend(loaded)
        return collected

    @abstractmethod
    def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
        """Write the prepared element dictionaries; implemented by each destination."""

    def modify_and_write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
        """Apply the per-element hooks, then delegate to ``write_dict()``.

        "Modify" here means: ``conform_dict()`` is called on every element
        first, then ``normalize_dict()`` maps each element to the dictionary
        that is finally handed to ``write_dict()``.
        """
        # Run all in-place conforming before any normalization starts.
        for element in elements_dict:
            self.conform_dict(data=element)
        normalized = [self.normalize_dict(element_dict=element) for element in elements_dict]
        return self.write_dict(*args, elements_dict=normalized, **kwargs)

    def write_elements(self, elements: list["Element"], *args, **kwargs) -> None:
        """Convert ``elements`` with ``to_dict()`` and push them through the write pipeline."""
        as_dicts = [element.to_dict() for element in elements]
        self.modify_and_write_dict(*args, elements_dict=as_dicts, **kwargs)
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
class SourceConnectorCleanupMixin:
    """Mixin that removes empty download directories once processing is done."""

    read_config: "ReadConfig"

    def cleanup(self, cur_dir=None):
        """Recursively remove empty directories below the download directory.

        Does nothing when ``read_config.preserve_downloads`` or
        ``read_config.download_only`` is set, or when the directory does not
        exist. Files are never deleted here; a directory is removed only if it
        is empty after its sub-directories have been cleaned up.

        Args:
            cur_dir: Directory to clean; defaults to ``read_config.download_dir``.
        """
        if self.read_config.preserve_downloads or self.read_config.download_only:
            return
        if cur_dir is None:
            cur_dir = self.read_config.download_dir
        if cur_dir is None:
            return
        current = Path(cur_dir)
        if not current.is_dir():
            return
        # Walk with explicit paths rather than os.chdir(): changing the working
        # directory leaks out of this call and breaks for relative download
        # dirs with more than one path component.
        for child in current.iterdir():
            # don't traverse symlinks, not that there ever should be any
            if child.is_dir() and not child.is_symlink():
                self.cleanup(child)
        if not any(current.iterdir()):
            current.rmdir()
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
class PermissionsCleanupMixin:
    """Mixin that deletes the ``permissions_data`` tree under the output directory."""

    processor_config: "ProcessorConfig"

    def cleanup_permissions(self, cur_dir=None):
        """Recursively clean up permissions files and directories.

        A file is deleted; a directory has every non-symlink entry cleaned up
        recursively and is then removed if nothing is left in it. A path that
        does not exist is ignored.

        Args:
            cur_dir: Path to clean; defaults to
                ``<processor_config.output_dir>/permissions_data``.
        """
        if cur_dir is None:
            cur_dir = Path(self.processor_config.output_dir, "permissions_data")
        target = Path(cur_dir)
        if not target.exists():
            return
        if target.is_file():
            target.unlink()
            return
        # Walk with explicit paths rather than os.chdir(), which would leave
        # the process in a different working directory after the call.
        for child in target.iterdir():
            # don't traverse symlinks, not that there ever should be any
            if not child.is_symlink():
                self.cleanup_permissions(child)
        # Only remove the directory when it is really empty; leftover symlinks
        # would otherwise make rmdir() raise.
        if not any(target.iterdir()):
            target.rmdir()
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
class IngestDocCleanupMixin:
    """Mixin that deletes an ingest doc's local download after processing."""

    read_config: ReadConfig

    @property
    @abstractmethod
    def filename(self):
        """The local filename of the document after fetching from remote source."""

    def cleanup_file(self):
        """Remove the local copy of the file after successful processing.

        Nothing is removed when downloads are preserved, when the file does
        not exist, or when running in download-only mode.
        """
        if self.read_config.preserve_downloads:
            return
        if not self.filename.is_file():
            return
        if self.read_config.download_only:
            return
        logger.debug(f"cleaning up {self}")
        os.unlink(self.filename)
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
class ConfigSessionHandleMixin:
    """Mixin for connector configs that can create a session handle."""

    @abstractmethod
    def create_session_handle(self) -> BaseSessionHandle:
        """Create a session handle that will be assigned on each IngestDoc to
        share session-related resources across all document handling for a
        given subprocess."""
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
@dataclass
class IngestDocSessionHandleMixin:
    """Mixin giving an ingest doc a lazily created, assignable session handle."""

    connector_config: ConfigSessionHandleMixin
    # Cached handle; excluded from __init__ and filled on first access or by the setter.
    _session_handle: Optional[BaseSessionHandle] = field(default=None, init=False)

    @property
    def session_handle(self):
        """Return the session handle, creating and storing one if none is assigned."""
        handle = self._session_handle
        if handle is None:
            handle = self.connector_config.create_session_handle()
            self._session_handle = handle
        return handle

    @session_handle.setter
    def session_handle(self, session_handle: BaseSessionHandle):
        self._session_handle = session_handle
|