unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
import multiprocessing as mp
|
|
6
|
+
import shutil
|
|
7
|
+
from dataclasses import InitVar, dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
|
|
12
|
+
from unstructured_ingest.v2.logger import logger, make_default_logger
|
|
13
|
+
from unstructured_ingest.v2.otel import OtelHandler
|
|
14
|
+
from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
|
|
15
|
+
from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
|
|
16
|
+
from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
|
|
17
|
+
from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
|
|
18
|
+
from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
|
|
19
|
+
from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
|
|
20
|
+
from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
|
|
21
|
+
from unstructured_ingest.v2.pipeline.steps.uncompress import Uncompressor, UncompressStep
|
|
22
|
+
from unstructured_ingest.v2.pipeline.steps.upload import UploadStep
|
|
23
|
+
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
|
|
24
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
25
|
+
ConnectionConfig,
|
|
26
|
+
DownloaderConfigT,
|
|
27
|
+
IndexerConfigT,
|
|
28
|
+
UploaderConfigT,
|
|
29
|
+
UploadStagerConfigT,
|
|
30
|
+
destination_registry,
|
|
31
|
+
source_registry,
|
|
32
|
+
)
|
|
33
|
+
from unstructured_ingest.v2.processes.connectors.local import LocalUploader
|
|
34
|
+
from unstructured_ingest.v2.processes.embedder import EmbedderConfig
|
|
35
|
+
from unstructured_ingest.v2.processes.filter import FiltererConfig
|
|
36
|
+
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class PipelineError(Exception):
    """Error raised by the pipeline when prechecks fail or the run ends with a non-empty status."""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class Pipeline:
    """Local ingest pipeline assembled from one step per configured process.

    Each ``InitVar`` is a raw process handed to the constructor; ``__post_init__``
    wraps it into the matching ``*_step`` field. Fields declared with
    ``init=False`` are never constructor arguments.
    """

    # Shared processor configuration handed to every step.
    context: ProcessorConfig

    # Required processes and the steps that wrap them.
    indexer: InitVar[IndexerT]
    indexer_step: IndexStep = field(init=False)

    downloader: InitVar[DownloaderT]
    downloader_step: DownloadStep = field(init=False)

    partitioner: InitVar[Partitioner]
    partitioner_step: PartitionStep = field(init=False)

    # Optional element modifiers; the step stays None when the process is not given.
    chunker: InitVar[Chunker | None] = None
    chunker_step: ChunkStep | None = field(init=False, default=None)

    embedder: InitVar[Embedder | None] = None
    embedder_step: EmbedStep | None = field(init=False, default=None)

    stager: InitVar[UploadStager | None] = None
    stager_step: UploadStageStep | None = field(init=False, default=None)

    # The default is a single LocalUploader instance created when the class is defined.
    uploader: InitVar[Uploader] = field(default=LocalUploader())
    uploader_step: UploadStep | None = field(init=False, default=None)

    # Populated in __post_init__ only when context.uncompress is truthy.
    uncompress_step: UncompressStep | None = field(init=False, default=None)

    filterer: InitVar[Filterer | None] = None
    filter_step: FilterStep | None = field(init=False, default=None)
|
|
72
|
+
|
|
73
|
+
def __post_init__(
    self,
    indexer: IndexerT,
    downloader: DownloaderT,
    partitioner: Partitioner,
    chunker: Chunker | None = None,
    embedder: Embedder | None = None,
    stager: UploadStager | None = None,
    uploader: Uploader | None = None,
    filterer: Filterer | None = None,
):
    """Set up logging and tracing, then wrap every supplied process in its step.

    Optional processes that are falsy leave their step as ``None``. The
    uncompress step is created only when ``context.uncompress`` is truthy.
    Finishes by calling ``check_destination_connector``, which raises
    ``ValueError`` when the destination expects a stager that was not given.
    """
    ctx = self.context
    log_level = logging.DEBUG if ctx.verbose else logging.INFO
    make_default_logger(level=log_level)
    OtelHandler(otel_endpoint=ctx.otel_endpoint).init_trace()

    self.indexer_step = IndexStep(process=indexer, context=ctx)
    self.downloader_step = DownloadStep(process=downloader, context=ctx)

    self.filter_step = None
    if filterer:
        self.filter_step = FilterStep(process=filterer, context=ctx)

    self.partitioner_step = PartitionStep(process=partitioner, context=ctx)

    self.chunker_step = None
    if chunker:
        self.chunker_step = ChunkStep(process=chunker, context=ctx)

    self.embedder_step = None
    if embedder:
        self.embedder_step = EmbedStep(process=embedder, context=ctx)
    # TODO: support initialize() call from each step process
    # Potential long call to download embedder models, run before any fanout:
    if embedder and embedder.config:
        embedder.config.get_embedder().initialize()

    self.stager_step = None
    if stager:
        self.stager_step = UploadStageStep(process=stager, context=ctx)

    self.uploader_step = UploadStep(process=uploader, context=ctx)

    if ctx.uncompress:
        self.uncompress_step = UncompressStep(process=Uncompressor(), context=ctx)

    self.check_destination_connector()
|
|
106
|
+
|
|
107
|
+
def check_destination_connector(self):
    """Verify that a stager is set whenever the destination registry entry declares one.

    Does nothing when no uploader step exists.

    Raises:
        ValueError: the registry entry for the uploader's connector type has an
            ``upload_stager`` but ``self.stager_step`` is ``None``.
    """
    upload_step = self.uploader_step
    if not upload_step:
        return
    entry = destination_registry[upload_step.process.connector_type]
    # Nothing to enforce if the connector has no stager or one is already configured.
    if not entry.upload_stager or self.stager_step is not None:
        return
    raise ValueError(
        f"pipeline with uploader type {upload_step.process.__class__.__name__} "
        f"expects a stager of type {entry.upload_stager.__name__} "
        f"but one was not set"
    )
|
|
119
|
+
|
|
120
|
+
def cleanup(self):
    """Remove ``context.work_dir`` when ``context.delete_cache`` is set and the directory exists."""
    if not self.context.delete_cache:
        return
    work_dir = self.context.work_dir
    if not Path(work_dir).exists():
        return
    logger.info(f"deleting cache directory: {work_dir}")
    shutil.rmtree(work_dir)
|
|
124
|
+
|
|
125
|
+
def log_statuses(self):
    """Log every entry of ``context.status`` at error level; no-op when it is empty.

    ``status`` is treated as a mapping of mappings: one line is logged per
    inner key/value pair, prefixed by the outer key.
    """
    status = self.context.status
    if not status:
        return
    logger.error(f"{len(status)} failed documents:")
    for outer_key, details in status.items():
        for inner_key, message in details.items():
            logger.error(f"{outer_key}: [{inner_key}] {message}")
|
|
131
|
+
|
|
132
|
+
def run(self):
    """Run prechecks and the pipeline inside a tracing span, then report and clean up.

    Statuses are always logged and the cache cleaned up, even when the run raises.

    Raises:
        PipelineError: ``context.status`` is non-empty after the run.
    """
    handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.info)
    try:
        span = handler.get_tracer().start_as_current_span(
            "ingest process", record_exception=True
        )
        with span:
            self._run_prechecks()
            self._run()
    finally:
        self.log_statuses()
        self.cleanup()
        # NOTE(review): raising inside ``finally`` replaces any exception already
        # propagating from the ``try`` body (it remains reachable via __context__).
        if self.context.status:
            raise PipelineError("Pipeline did not run successfully")
|
|
145
|
+
|
|
146
|
+
def clean_results(self, results: list[Any | list[Any]] | None) -> list[Any] | None:
    """Flatten one level of nesting and drop falsy entries.

    Args:
        results: step output; items may be single values or lists of values.

    Returns:
        The flattened list of truthy values, or ``None`` when nothing remains.
    """
    if not results:
        return None
    flattened = []
    for item in results:
        if not item:
            continue
        if isinstance(item, list):
            flattened.extend(item)
        else:
            flattened.append(item)
    # Nested lists may themselves contain falsy entries.
    kept = [entry for entry in flattened if entry]
    return kept if kept else None
|
|
158
|
+
|
|
159
|
+
def _run_prechecks(self):
    """Call ``precheck()`` on every configured step's process and report all failures.

    Every step is checked, including after an earlier failure, so that all
    problems are logged together.

    Raises:
        PipelineError: at least one precheck raised.
    """
    steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
    optional_steps = (
        self.chunker_step,
        self.embedder_step,
        self.uncompress_step,
        self.stager_step,
    )
    steps.extend(step for step in optional_steps if step)

    failures = {}
    for step in steps:
        try:
            step.process.precheck()
        except Exception as e:
            failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"

    if not failures:
        return
    for process_name, reason in failures.items():
        logger.error(f"Step precheck failure: {process_name}: {reason}")
    raise PipelineError("Precheck failed")
|
|
179
|
+
|
|
180
|
+
def apply_filter(self, records: list[dict]) -> list[dict]:
    """Keep only the records whose ``file_data_path`` survives the filter step.

    Args:
        records: step outputs, each a dict carrying a ``file_data_path`` key.

    Returns:
        ``records`` unchanged when no filter step is configured or when
        ``records`` is empty or ``None`` (callers may pass the ``None`` that
        ``clean_results`` returns). Otherwise the subset of ``records``, in
        their original order, whose path was returned by the filter step.
    """
    if not self.filter_step or not records:
        return records
    # The filter step only needs the file data path, not the rest of the record.
    data_to_filter = [{"file_data_path": record["file_data_path"]} for record in records]
    # A set makes the membership test below O(1) per record instead of O(n).
    kept_paths = {
        result["file_data_path"]
        for result in self.filter_step(data_to_filter)
        if result is not None
    }
    return [record for record in records if record["file_data_path"] in kept_paths]
|
|
189
|
+
|
|
190
|
+
def get_indices(self) -> list[dict]:
    """Run the indexer step and wrap each result as ``{"file_data_path": <result>}``.

    Uses ``asyncio.run`` on ``run_async()`` when the indexer process reports
    itself as async, otherwise calls ``run()`` directly.
    """
    step = self.indexer_step
    if step.process.is_async():
        found = asyncio.run(step.run_async())
    else:
        found = step.run()
    return [{"file_data_path": entry} for entry in found]
|
|
197
|
+
|
|
198
|
+
def _run(self):
    """Execute the pipeline stages in order, returning early once nothing is left to process.

    Stage order: index, filter, download, filter, optional uncompress, filter,
    partition, then whichever of chunk / embed / stage are configured, and
    finally upload. ``context.status`` is reset at the start. Returns after
    the download-side stages when ``context.download_only`` is set.
    """
    ctx = self.context
    logger.info(f"running local pipeline: {self} with configs: {ctx.model_dump_json()}")

    # Use a manager-backed dict when multiprocessing is supported, a plain dict otherwise.
    if ctx.mp_supported:
        manager = mp.Manager()
        ctx.status = manager.dict()
    else:
        ctx.status = {}

    # Index into data source
    records = self.get_indices()
    if not records:
        logger.info("No files to process after indexer, exiting")
        return

    # Initial filtering on indexed content
    records = self.apply_filter(records=records)
    if not records:
        logger.info("No files to process after filtering indexed content, exiting")
        return

    # Download associated content to local file system
    downloads = self.clean_results(results=self.downloader_step(records))
    if not downloads:
        logger.info("No files to process after downloader, exiting")
        return

    # Post download filtering
    downloads = self.apply_filter(records=downloads)
    if not downloads:
        logger.info("No files to process after filtering downloaded content, exiting")
        return

    # Run uncompress if available; clean_results flattens the list of lists it yields
    if self.uncompress_step:
        downloads = self.clean_results(results=self.uncompress_step(downloads))

    # Post uncompress filtering
    downloads = self.apply_filter(records=downloads)
    if not downloads:
        logger.info("No files to process after filtering uncompressed content, exiting")
        return

    if not downloads or ctx.download_only:
        return

    # Partition content
    elements = self.partitioner_step(downloads)
    # Download data non longer needed, delete if possible
    self.downloader_step.delete_cache()
    elements = self.clean_results(results=elements)
    if not elements:
        logger.info("No files to process after partitioning, exiting")
        return

    # Run element specific modifiers
    previous = self.partitioner_step
    for step in filter(None, (self.chunker_step, self.embedder_step, self.stager_step)):
        elements = self.clean_results(results=step(elements))
        # Delete data from previous step if possible since no longer needed
        previous.delete_cache()
        previous = step
        if not elements:
            logger.info(f"no files to process after {step.__class__.__name__}, exiting")
            return

    # Upload the final result
    self.uploader_step(iterable=elements)
    previous.delete_cache()
|
|
272
|
+
|
|
273
|
+
def __str__(self):
    """Render the configured steps, in execution order, joined by ``" -> "``.

    The filter step, when configured, appears after the indexer, after the
    downloader, and after the uncompress step if there is one. Steps that are
    not configured are omitted; in particular no ``"None"`` placeholder is
    emitted when an uncompress step exists without a filter step.
    """
    filter_label = [str(self.filter_step)] if self.filter_step else []

    parts = [str(self.indexer_step), *filter_label, str(self.downloader_step), *filter_label]
    if self.uncompress_step:
        parts.append(str(self.uncompress_step))
        parts.extend(filter_label)
    parts.append(str(self.partitioner_step))
    optional_steps = (self.chunker_step, self.embedder_step, self.stager_step)
    parts.extend(str(step) for step in optional_steps if step)
    parts.append(str(self.uploader_step))
    return " -> ".join(parts)
|
|
291
|
+
|
|
292
|
+
@classmethod
def from_configs(
    cls,
    context: ProcessorConfig,
    indexer_config: IndexerConfigT,
    downloader_config: DownloaderConfigT,
    source_connection_config: ConnectionConfig,
    partitioner_config: PartitionerConfig,
    filterer_config: FiltererConfig | None = None,
    chunker_config: ChunkerConfig | None = None,
    embedder_config: EmbedderConfig | None = None,
    destination_connection_config: ConnectionConfig | None = None,
    stager_config: UploadStagerConfigT | None = None,
    uploader_config: UploaderConfigT | None = None,
) -> "Pipeline":
    """Build a pipeline from config objects by resolving them against the connector registries.

    The source entry is the one whose indexer, downloader and connection
    config types all match the given configs. When ``uploader_config`` is
    given, the destination entry is matched on the uploader config type and
    narrowed by the destination connection and stager configs if supplied.
    Without ``uploader_config`` the pipeline is built with the default uploader.

    Returns:
        An instance of ``cls`` (so subclasses get an instance of themselves
        on every path).

    Raises:
        ValueError: zero or more than one registry entry matches the source
            configs, or the destination configs.
    """
    # Get registry key based on indexer config
    source_entry = {
        k: v
        for k, v in source_registry.items()
        if isinstance(indexer_config, v.indexer_config)
        and isinstance(downloader_config, v.downloader_config)
        and isinstance(source_connection_config, v.connection_config)
    }
    if len(source_entry) > 1:
        raise ValueError(
            f"multiple entries found matching provided indexer, "
            f"downloader and connection configs: {source_entry}"
        )
    if not source_entry:
        raise ValueError(
            "no entry found in source registry with matching indexer, "
            "downloader and connection configs"
        )
    source = next(iter(source_entry.values()))

    pipeline_kwargs = {
        "context": context,
        "indexer": source.indexer(
            index_config=indexer_config, connection_config=source_connection_config
        ),
        "downloader": source.downloader(
            download_config=downloader_config, connection_config=source_connection_config
        ),
        "partitioner": Partitioner(config=partitioner_config),
    }
    if filterer_config:
        pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
    if chunker_config:
        pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
    if embedder_config:
        pipeline_kwargs["embedder"] = Embedder(config=embedder_config)
    if not uploader_config:
        # Build through cls, like the final return, instead of the hard-coded base class.
        return cls(**pipeline_kwargs)

    # Narrow the destination registry step by step with whichever configs were supplied.
    destination_entry = {
        k: v
        for k, v in destination_registry.items()
        if isinstance(uploader_config, v.uploader_config)
    }
    if destination_connection_config:
        destination_entry = {
            k: v
            for k, v in destination_entry.items()
            if isinstance(destination_connection_config, v.connection_config)
        }
    if stager_config:
        destination_entry = {
            k: v
            for k, v in destination_entry.items()
            if isinstance(stager_config, v.upload_stager_config)
        }

    if len(destination_entry) > 1:
        raise ValueError(
            f"multiple entries found matching provided uploader, "
            f"stager and connection configs: {destination_entry}"
        )
    if not destination_entry:
        raise ValueError(
            "no entry found in destination registry with matching uploader, "
            "stager and connection configs"
        )

    destination = next(iter(destination_entry.values()))
    if stager_config:
        pipeline_kwargs["stager"] = destination.upload_stager(
            upload_stager_config=stager_config
        )
    # uploader_config is always truthy here: the falsy case returned above.
    uploader_kwargs = {"upload_config": uploader_config}
    if destination_connection_config:
        uploader_kwargs["connection_config"] = destination_connection_config
    pipeline_kwargs["uploader"] = destination.uploader(**uploader_kwargs)
    return cls(**pipeline_kwargs)
|
|
File without changes
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import hashlib
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, Optional, TypedDict
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.v2.interfaces import FileData
|
|
9
|
+
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
|
|
10
|
+
from unstructured_ingest.v2.logger import logger
|
|
11
|
+
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
12
|
+
from unstructured_ingest.v2.processes.chunker import Chunker
|
|
13
|
+
from unstructured_ingest.v2.utils import serialize_base_model_json
|
|
14
|
+
|
|
15
|
+
STEP_ID = "chunk"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ChunkStepResponse(TypedDict):
    """Result of the chunk step for a single input file."""

    # Path of the file data record that was passed into the step.
    file_data_path: str
    # Path of the JSON file holding the chunked output.
    path: str
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class ChunkStep(PipelineStep):
    """Pipeline step wrapping a ``Chunker`` process."""

    # The chunker whose config drives naming, hashing and logging for this step.
    process: Chunker
    # Step identifier; defaults to the module-level STEP_ID.
    identifier: str = STEP_ID
|
|
27
|
+
|
|
28
|
+
def __str__(self):
    """Return ``"<identifier> (<chunking strategy>)"``."""
    strategy = self.process.config.chunking_strategy
    return f"{self.identifier} ({strategy})"
|
|
30
|
+
|
|
31
|
+
def __post_init__(self):
    """Log the step's creation together with its JSON-dumped config (``None`` if unset)."""
    process_config = self.process.config
    if process_config:
        config = process_config.model_dump_json()
    else:
        config = None
    logger.info(f"created {self.identifier} with configs: {config}")
|
|
34
|
+
|
|
35
|
+
def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
    """Decide whether chunking must run for this file.

    Returns ``True`` when reprocessing is requested by the context or by the
    file data, or when the output file ``filepath`` does not exist yet.
    """
    forced = self.context.reprocess or file_data.reprocess
    if not forced:
        return not filepath.exists()
    return True
|
|
39
|
+
|
|
40
|
+
def get_output_filepath(self, filename: Path) -> Path:
    """Return the resolved cache path for this file's chunk output, creating its directory.

    The file name is ``<hash>.json`` where the hash comes from ``get_hash``
    called with the input file's name as the extra.
    """
    digest = self.get_hash(extras=[filename.name])
    target = (self.cache_dir / f"{digest}.json").resolve()
    target.parent.mkdir(parents=True, exist_ok=True)
    return target
|
|
45
|
+
|
|
46
|
+
def _save_output(self, output_filepath: str, chunked_content: list[dict]):
    """Write ``chunked_content`` to ``output_filepath`` as JSON indented by two spaces."""
    with Path(output_filepath).open("w") as handle:
        logger.debug(f"writing chunker output to: {output_filepath}")
        json.dump(chunked_content, handle, indent=2)
|
|
50
|
+
|
|
51
|
+
async def _run_async(
    self, fn: Callable, path: str, file_data_path: str, **kwargs
) -> ChunkStepResponse:
    """Chunk the elements file at ``path`` unless a cached output can be reused.

    Args:
        fn: the chunking callable; awaited when it is a coroutine function
            (under ``context.semaphore`` if one is set), called directly otherwise.
        path: path of the elements file to chunk.
        file_data_path: path of the file data record for this document.
        **kwargs: accepted but not used here.

    Returns:
        A response pointing at the output file, whether freshly written or reused.
    """
    elements_path = Path(path)
    file_data = file_data_from_file(path=file_data_path)
    output_filepath = self.get_output_filepath(filename=elements_path)
    response = ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))

    if not self.should_chunk(filepath=output_filepath, file_data=file_data):
        logger.debug(f"skipping chunking, output already exists: {output_filepath}")
        return response

    fn_kwargs = {"elements_filepath": elements_path}
    if asyncio.iscoroutinefunction(fn):
        semaphore = self.context.semaphore
        if semaphore:
            async with semaphore:
                chunked = await fn(**fn_kwargs)
        else:
            chunked = await fn(**fn_kwargs)
    else:
        chunked = fn(**fn_kwargs)

    self._save_output(output_filepath=str(output_filepath), chunked_content=chunked)
    return response
|
|
73
|
+
|
|
74
|
+
def get_hash(self, extras: Optional[list[str]]) -> str:
    """Return a 12-character SHA-256 hex prefix of the chunker config plus ``extras``.

    The config is serialized with sorted keys and ASCII-only output; the
    strings in ``extras``, if any, are appended to it before hashing.
    """
    material = serialize_base_model_json(
        model=self.process.config, sort_keys=True, ensure_ascii=True
    )
    if extras:
        material = material + "".join(extras)
    digest = hashlib.sha256(material.encode()).hexdigest()
    return digest[:12]
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import hashlib
|
|
3
|
+
import json
|
|
4
|
+
import shutil
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Callable, Optional, TypedDict, TypeVar
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.v2.interfaces import FileData, download_responses
|
|
10
|
+
from unstructured_ingest.v2.interfaces.downloader import Downloader
|
|
11
|
+
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
|
|
12
|
+
from unstructured_ingest.v2.logger import logger
|
|
13
|
+
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
14
|
+
from unstructured_ingest.v2.utils import serialize_base_model_json
|
|
15
|
+
|
|
16
|
+
DownloaderT = TypeVar("DownloaderT", bound=Downloader)
|
|
17
|
+
|
|
18
|
+
STEP_ID = "download"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DownloadStepResponse(TypedDict):
    """Result of the download step for a single downloaded file."""

    # Path of the file data record associated with the download.
    file_data_path: str
    # Local path of the downloaded file.
    path: str
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class DownloadStep(PipelineStep):
    """Pipeline step wrapping a downloader process."""

    # The downloader; its download_config and connection_config are read by this step.
    process: DownloaderT
    # Step identifier; defaults to the module-level STEP_ID.
    identifier: str = STEP_ID
|
|
30
|
+
|
|
31
|
+
def __str__(self):
    """Return ``"<identifier> (<downloader class name>)"``."""
    process_name = self.process.__class__.__name__
    return f"{self.identifier} ({process_name})"
|
|
33
|
+
|
|
34
|
+
def __post_init__(self):
    """Log the step's creation with its download and connection configs dumped as JSON.

    A config that is falsy is logged as ``None``.
    """
    config = None
    if self.process.download_config:
        config = self.process.download_config.model_dump_json()

    connection_config = None
    if self.process.connection_config:
        connection_config = self.process.connection_config.model_dump_json()

    logger.info(
        f"Created {self.identifier} with configs: {config}, "
        f"connection configs: {connection_config}"
    )
|
|
47
|
+
|
|
48
|
+
@staticmethod
def is_float(value: str) -> bool:
    """Return whether ``value`` can be converted with ``float()``.

    Values that ``float()`` rejects with ``TypeError`` (for example ``None``)
    are reported as ``False`` rather than raising, so the predicate is safe
    to call on unvalidated metadata.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True
|
|
55
|
+
|
|
56
|
+
def should_download(self, file_data: FileData, file_data_path: str) -> bool:
    """Decide whether the file must be downloaded (again).

    Returns ``True`` when re-download is forced by the context, when the
    downloader yields no download path or the path does not exist, or when the
    local file's mtime is greater than the numeric ``date_modified`` in the
    file data's metadata. In that last case the file data is also flagged for
    reprocessing and written back to ``file_data_path``.
    """
    if self.context.re_download:
        return True
    local_path = self.process.get_download_path(file_data=file_data)
    if not local_path or not local_path.exists():
        return True
    if not local_path.is_file():
        return False
    modified = file_data.metadata.date_modified
    if not modified or not self.is_float(modified):
        return False
    # NOTE(review): this re-downloads when the LOCAL copy's mtime is greater than
    # the recorded date_modified - confirm the comparison direction is intended.
    if not (local_path.stat().st_mtime > float(modified)):
        return False
    # Also update file data to mark this to reprocess since this won't change the filename
    file_data.reprocess = True
    file_data.to_file(path=file_data_path)
    return True
|
|
73
|
+
|
|
74
|
+
def update_file_data(
    self, file_data: FileData, file_data_path: Path, download_path: Path
) -> None:
    """Record the local download location and size on ``file_data`` and persist it.

    Sets ``local_download_path`` to the resolved ``download_path``, fills in or
    corrects ``metadata.filesize_bytes`` from the file on disk (warning when a
    recorded size disagrees), then writes the model dump to ``file_data_path``
    as JSON.
    """
    file_data.local_download_path = str(download_path.resolve())
    actual_size = download_path.stat().st_size
    metadata = file_data.metadata

    if not metadata.filesize_bytes and actual_size:
        # No size recorded yet: take it from the local file.
        metadata.filesize_bytes = actual_size
    elif metadata.filesize_bytes and metadata.filesize_bytes != actual_size:
        logger.warning(
            f"file size in original file data "
            f"({file_data.metadata.filesize_bytes}) doesn't "
            f"match size of local file: {actual_size}, updating"
        )
        metadata.filesize_bytes = actual_size

    logger.debug(f"updating file data with new content: {file_data.model_dump()}")
    with file_data_path.open("w") as handle:
        json.dump(file_data.model_dump(), handle, indent=2)
|
|
94
|
+
|
|
95
|
+
async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
    """Download the file described by ``file_data_path`` unless the local copy can be reused.

    Args:
        fn: the download callable; awaited when it is a coroutine function
            (under ``context.semaphore`` if one is set), called directly otherwise.
        file_data_path: path of the file data record to download.

    Returns:
        One response per downloaded file. When the download is skipped, a
        single response for the existing local file, after refreshing its
        file data.
    """
    file_data = file_data_from_file(path=file_data_path)
    download_path = self.process.get_download_path(file_data=file_data)

    if self.should_download(file_data=file_data, file_data_path=file_data_path):
        fn_kwargs = {"file_data": file_data}
        if asyncio.iscoroutinefunction(fn):
            semaphore = self.context.semaphore
            if semaphore:
                async with semaphore:
                    download_results = await fn(**fn_kwargs)
            else:
                download_results = await fn(**fn_kwargs)
        else:
            download_results = fn(**fn_kwargs)
        return self.create_step_results(
            current_file_data_path=file_data_path,
            download_results=download_results,
            current_file_data=file_data,
        )

    logger.debug(f"skipping download, file already exists locally: {download_path}")
    self.update_file_data(
        file_data=file_data,
        file_data_path=Path(file_data_path),
        download_path=download_path,
    )
    return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
|
|
119
|
+
|
|
120
|
+
def create_step_results(
    self,
    current_file_data_path: str,
    current_file_data: FileData,
    download_results: download_responses,
) -> list[DownloadStepResponse]:
    """Turn the downloader's output into step responses, persisting file data as needed.

    Args:
        current_file_data_path: path of the file data record that was downloaded.
        current_file_data: the file data loaded from that path.
        download_results: either one result (with ``"path"`` and ``"file_data"``
            keys) or a list of such results.

    Returns:
        One response per result. Every ``path`` value is a ``str``, as the
        response type declares.
    """
    if isinstance(download_results, list):
        # Supplemental results generated as part of the download process
        responses = []
        for res in download_results:
            file_data = res["file_data"]
            file_data_path = self.persist_new_file_data(file_data=file_data)
            download_path = res["path"]
            self.update_file_data(
                file_data=file_data,
                file_data_path=Path(file_data_path),
                download_path=download_path,
            )
            # Stringify the path like every other branch does.
            responses.append(
                DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
            )
        return responses

    download_path = download_results["path"]
    if download_results["file_data"].identifier == current_file_data.identifier:
        # Same record: update the existing file data in place.
        self.update_file_data(
            file_data=current_file_data,
            file_data_path=Path(current_file_data_path),
            download_path=download_path,
        )
        return [
            DownloadStepResponse(file_data_path=current_file_data_path, path=str(download_path))
        ]

    # The downloader returned file data with a different identifier: persist it separately.
    new_file_data = download_results["file_data"]
    new_file_data_path = self.persist_new_file_data(file_data=new_file_data)
    self.update_file_data(
        file_data=new_file_data,
        file_data_path=Path(new_file_data_path),
        download_path=download_path,
    )
    # NOTE(review): the response references the ORIGINAL file data path, not the
    # newly persisted one - confirm whether new_file_data_path was intended here.
    return [
        DownloadStepResponse(file_data_path=current_file_data_path, path=str(download_path))
    ]
|
|
169
|
+
|
|
170
|
+
def persist_new_file_data(self, file_data: FileData) -> str:
    """Write ``file_data`` to ``<cache_dir>/<hash>.json`` and return that path as a string.

    The hash comes from ``get_hash`` with the file data identifier as the
    extra. The parent directory is created if missing.
    """
    digest = self.get_hash(extras=[file_data.identifier])
    target = (self.cache_dir / f"{digest}.json").resolve()
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as handle:
        json.dump(file_data.model_dump(), handle, indent=2)
    return str(target)
|
|
178
|
+
|
|
179
|
+
def get_hash(self, extras: Optional[list[str]]) -> str:
    """Return a 12-character SHA-256 hex prefix of the download and connection configs.

    Both configs are serialized, combined into one dict and dumped with sorted
    keys; the strings in ``extras``, if any, are appended before hashing.
    """
    configs = {
        "download_config": json.loads(
            serialize_base_model_json(model=self.process.download_config)
        ),
        "connection_config": json.loads(
            serialize_base_model_json(model=self.process.connection_config)
        ),
    }
    material = json.dumps(configs, sort_keys=True)
    if extras:
        material = material + "".join(extras)
    digest = hashlib.sha256(material.encode()).hexdigest()
    return digest[:12]
|
|
194
|
+
|
|
195
|
+
@property
def cache_dir(self) -> Path:
    """The downloader's ``download_config.download_dir``."""
    download_config = self.process.download_config
    return download_config.download_dir
|
|
198
|
+
|
|
199
|
+
def delete_cache(self):
    """Remove the download directory when iterative deletion is enabled.

    Nothing is deleted unless ``context.iter_delete`` is set,
    ``context.preserve_downloads`` is not, and the cache directory exists.
    """
    ctx = self.context
    if not ctx.iter_delete or ctx.preserve_downloads:
        return
    if not self.cache_dir.exists():
        return
    cache_dir = self.cache_dir
    logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
    shutil.rmtree(cache_dir)
|