unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import multiprocessing as mp
|
|
3
|
+
from dataclasses import InitVar, dataclass, field
|
|
4
|
+
from time import time
|
|
5
|
+
from typing import Any, Optional, Union
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.v2.interfaces import ProcessorConfig
|
|
8
|
+
from unstructured_ingest.v2.logger import logger, make_default_logger
|
|
9
|
+
from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
|
|
10
|
+
from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
|
|
11
|
+
from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
|
|
12
|
+
from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
|
|
13
|
+
from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
|
|
14
|
+
from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
|
|
15
|
+
from unstructured_ingest.v2.pipeline.steps.uncompress import Uncompressor, UncompressStep
|
|
16
|
+
from unstructured_ingest.v2.pipeline.steps.upload import Uploader, UploadStep
|
|
17
|
+
from unstructured_ingest.v2.pipeline.utils import sterilize_dict
|
|
18
|
+
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
|
|
19
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
20
|
+
ConnectionConfig,
|
|
21
|
+
DownloaderConfigT,
|
|
22
|
+
IndexerConfigT,
|
|
23
|
+
UploaderConfigT,
|
|
24
|
+
UploadStagerConfigT,
|
|
25
|
+
destination_registry,
|
|
26
|
+
source_registry,
|
|
27
|
+
)
|
|
28
|
+
from unstructured_ingest.v2.processes.connectors.local import LocalUploader
|
|
29
|
+
from unstructured_ingest.v2.processes.embedder import EmbedderConfig
|
|
30
|
+
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class PipelineError(Exception):
    """Raised when a pipeline run finishes with failed documents recorded in its context."""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class Pipeline:
    """End-to-end ingest pipeline: index -> download -> [uncompress] ->
    partition -> [chunk] -> [embed] -> [stage] -> upload.

    Process objects (indexer, downloader, ...) are passed as init-only
    variables and wrapped into their corresponding ``*_step`` fields in
    ``__post_init__``; the steps are what actually execute.
    """

    # Shared run configuration; also accumulates per-document failure status.
    context: ProcessorConfig
    indexer: InitVar[IndexerT]
    indexer_step: IndexStep = field(init=False)
    downloader: InitVar[DownloaderT]
    downloader_step: DownloadStep = field(init=False)
    partitioner: InitVar[Partitioner]
    partitioner_step: PartitionStep = field(init=False)
    chunker: InitVar[Optional[Chunker]] = None
    chunker_step: Optional[ChunkStep] = field(init=False, default=None)
    embedder: InitVar[Optional[Embedder]] = None
    embedder_step: Optional[EmbedStep] = field(init=False, default=None)
    stager: InitVar[Optional[UploadStager]] = None
    stager_step: Optional[UploadStageStep] = field(init=False, default=None)
    # default_factory (not default=LocalUploader()) so each Pipeline gets its
    # own uploader instance rather than all instances sharing one object
    # created at class-definition time.
    uploader: InitVar[Uploader] = field(default_factory=LocalUploader)
    uploader_step: Optional[UploadStep] = field(init=False, default=None)
    uncompress_step: Optional[UncompressStep] = field(init=False, default=None)

    def __post_init__(
        self,
        indexer: IndexerT,
        downloader: DownloaderT,
        partitioner: Partitioner,
        chunker: Optional[Chunker] = None,
        embedder: Optional[Embedder] = None,
        stager: Optional[UploadStager] = None,
        uploader: Optional[Uploader] = None,
    ):
        """Wrap the provided process objects into pipeline steps and validate the destination."""
        make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
        self.indexer_step = IndexStep(process=indexer, context=self.context)
        self.downloader_step = DownloadStep(process=downloader, context=self.context)
        self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
        self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None

        self.embedder_step = EmbedStep(process=embedder, context=self.context) if embedder else None
        # TODO: support initialize() call from each step process
        # Potential long call to download embedder models, run before any fanout:
        if embedder and embedder.config:
            embedder.config.get_embedder().initialize()

        self.stager_step = UploadStageStep(process=stager, context=self.context) if stager else None
        self.uploader_step = UploadStep(process=uploader, context=self.context)
        if self.context.uncompress:
            process = Uncompressor()
            self.uncompress_step = UncompressStep(process=process, context=self.context)

        self.check_destination_connector()

    def check_destination_connector(self):
        """Raise if the configured destination connector requires a stager and none was set."""
        if not self.uploader_step:
            return
        uploader_connector_type = self.uploader_step.process.connector_type
        registry_entry = destination_registry[uploader_connector_type]
        if registry_entry.upload_stager and self.stager_step is None:
            raise ValueError(
                f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
                f"expects a stager of type {registry_entry.upload_stager.__name__} "
                f"but one was not set"
            )

    def cleanup(self):
        """Hook for post-run cleanup; currently a no-op."""
        pass

    def log_statuses(self):
        """Log every recorded per-document failure from the run context."""
        if status := self.context.status:
            logger.error(f"{len(status)} failed documents:")
            for doc_id, step_errors in status.items():
                for step_id, message in step_errors.items():
                    logger.error(f"{doc_id}: [{step_id}] {message}")

    def run(self):
        """Execute the pipeline, always logging failures; raise PipelineError if any occurred."""
        try:
            start_time = time()
            self._run()
            logger.info(f"Finished ingest process in {time() - start_time}s")
        finally:
            self.log_statuses()
            self.cleanup()
            if self.context.status:
                raise PipelineError("Pipeline did not run successfully")

    def clean_results(self, results: Optional[list[Union[Any, list[Any]]]]) -> Optional[list[Any]]:
        """Flatten one level of list nesting and drop falsy entries.

        Returns None when nothing remains, so callers can short-circuit.
        """
        if not results:
            return None
        flat: list[Any] = []
        for result in results:
            if isinstance(result, list):
                flat.extend(result)
            elif result:
                flat.append(result)
        # A nested list may itself contain falsy entries; filter once more.
        final = [entry for entry in flat if entry]
        return final or None

    def _run(self):
        # Fixed typo: original message read "pipline".
        logger.info(
            f"Running local pipeline: {self} with configs: "
            f"{sterilize_dict(self.context.to_dict(redact_sensitive=True))}"
        )
        # Use a manager-backed dict when multiprocessing so workers can record status.
        if self.context.mp_supported:
            manager = mp.Manager()
            self.context.status = manager.dict()
        else:
            self.context.status = {}

        # Index into data source
        indices = self.indexer_step.run()
        indices_inputs = [{"file_data_path": i} for i in indices]
        if not indices_inputs:
            return

        # Download associated content to local file system
        downloaded_data = self.downloader_step(indices_inputs)
        downloaded_data = self.clean_results(results=downloaded_data)
        if not downloaded_data:
            return

        # Run uncompress if available
        if self.uncompress_step:
            downloaded_data = self.uncompress_step(downloaded_data)
            # Flatten list of lists
            downloaded_data = self.clean_results(results=downloaded_data)

        if not downloaded_data:
            return

        # Partition content
        elements = self.partitioner_step(downloaded_data)
        elements = self.clean_results(results=elements)
        if not elements:
            return

        # Run element specific modifiers
        for step in [self.chunker_step, self.embedder_step, self.stager_step]:
            elements = step(elements) if step else elements
            elements = self.clean_results(results=elements)
            if not elements:
                return

        # Upload the final result
        self.uploader_step(iterable=elements)

    def __str__(self):
        """Render the configured steps as an 'a -> b -> c' chain, omitting unset optional steps."""
        s = [str(self.indexer_step), str(self.downloader_step)]
        if uncompress_step := self.uncompress_step:
            s.append(str(uncompress_step))
        s.append(str(self.partitioner_step))
        if chunker_step := self.chunker_step:
            s.append(str(chunker_step))
        if embedder_step := self.embedder_step:
            s.append(str(embedder_step))
        if stager_step := self.stager_step:
            s.append(str(stager_step))
        s.append(str(self.uploader_step))
        return " -> ".join(s)

    @classmethod
    def from_configs(
        cls,
        context: ProcessorConfig,
        indexer_config: IndexerConfigT,
        downloader_config: DownloaderConfigT,
        source_connection_config: ConnectionConfig,
        partitioner_config: PartitionerConfig,
        chunker_config: Optional[ChunkerConfig] = None,
        embedder_config: Optional[EmbedderConfig] = None,
        destination_connection_config: Optional[ConnectionConfig] = None,
        stager_config: Optional[UploadStagerConfigT] = None,
        uploader_config: Optional[UploaderConfigT] = None,
    ) -> "Pipeline":
        """Build a Pipeline by resolving the given configs against the connector registries.

        Raises:
            ValueError: if zero or multiple registry entries match the provided configs.
        """
        # Get registry key based on indexer config
        source_entry = {
            k: v
            for k, v in source_registry.items()
            if isinstance(indexer_config, v.indexer_config)
            and isinstance(downloader_config, v.downloader_config)
            and isinstance(source_connection_config, v.connection_config)
        }
        if len(source_entry) > 1:
            raise ValueError(
                f"multiple entries found matching provided indexer, "
                f"downloader and connection configs: {source_entry}"
            )
        if not source_entry:
            raise ValueError(
                "no entry found in source registry with matching indexer, "
                "downloader and connection configs"
            )
        source = next(iter(source_entry.values()))
        pipeline_kwargs = {
            "context": context,
            "indexer": source.indexer(
                index_config=indexer_config, connection_config=source_connection_config
            ),
            "downloader": source.downloader(
                download_config=downloader_config, connection_config=source_connection_config
            ),
            "partitioner": Partitioner(config=partitioner_config),
        }
        if chunker_config:
            pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
        if embedder_config:
            pipeline_kwargs["embedder"] = Embedder(config=embedder_config)
        if not uploader_config:
            # No destination requested; the default LocalUploader takes over.
            return cls(**pipeline_kwargs)

        # Narrow destination registry entries by uploader, then (optionally)
        # connection and stager configs.
        destination_entry = {
            k: v
            for k, v in destination_registry.items()
            if isinstance(uploader_config, v.uploader_config)
        }
        if destination_connection_config:
            destination_entry = {
                k: v
                for k, v in destination_entry.items()
                if isinstance(destination_connection_config, v.connection_config)
            }
        if stager_config:
            destination_entry = {
                k: v
                for k, v in destination_entry.items()
                if isinstance(stager_config, v.upload_stager_config)
            }

        if len(destination_entry) > 1:
            raise ValueError(
                f"multiple entries found matching provided uploader, "
                f"stager and connection configs: {destination_entry}"
            )
        if not destination_entry:
            # Fixed: original message incorrectly said "source registry".
            raise ValueError(
                "no entry found in destination registry with matching uploader, "
                "stager and connection configs"
            )

        destination = next(iter(destination_entry.values()))
        if stager_config:
            pipeline_kwargs["stager"] = destination.upload_stager(
                upload_stager_config=stager_config
            )
        # uploader_config is known truthy here (early return above covered the
        # falsy case), so the uploader is always built.
        uploader_kwargs = {"upload_config": uploader_config}
        if destination_connection_config:
            uploader_kwargs["connection_config"] = destination_connection_config
        pipeline_kwargs["uploader"] = destination.uploader(**uploader_kwargs)
        return cls(**pipeline_kwargs)
|
|
File without changes
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import hashlib
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, Optional, TypedDict
|
|
7
|
+
|
|
8
|
+
from unstructured.staging.base import elements_to_dicts
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.v2.interfaces import FileData
|
|
11
|
+
from unstructured_ingest.v2.logger import logger
|
|
12
|
+
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
13
|
+
from unstructured_ingest.v2.pipeline.utils import sterilize_dict
|
|
14
|
+
from unstructured_ingest.v2.processes.chunker import Chunker
|
|
15
|
+
|
|
16
|
+
# Identifier for this pipeline step; used in __str__ and the creation log line.
STEP_ID = "chunk"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ChunkStepResponse(TypedDict):
    """Result payload emitted by ChunkStep for the next pipeline step."""

    # Path to the serialized FileData JSON for this record.
    file_data_path: str
    # Path to the chunked-elements JSON written by this step.
    path: str
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class ChunkStep(PipelineStep):
    """Pipeline step that chunks partitioned elements and caches the output as JSON."""

    process: Chunker
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.config.chunking_strategy})"

    def __post_init__(self):
        # Log a redacted copy of the chunker config so secrets stay out of logs.
        if self.process.config:
            config = sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
        else:
            config = None
        logger.info(f"Created {self.identifier} with configs: {config}")

    def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
        """Chunk when reprocessing is forced or no cached output exists yet."""
        forced = self.context.reprocess or file_data.reprocess
        return True if forced else not filepath.exists()

    def get_output_filepath(self, filename: Path) -> Path:
        """Resolve the cached-output path for *filename*, creating its parent dir."""
        out_name = f"{self.get_hash(extras=[filename.name])}.json"
        out_path = (self.cache_dir / out_name).resolve()
        out_path.parent.mkdir(parents=True, exist_ok=True)
        return out_path

    def _save_output(self, output_filepath: str, chunked_content: list[dict]):
        """Serialize the chunked elements to the cached output file."""
        with open(str(output_filepath), "w") as out_file:
            logger.debug(f"Writing chunker output to: {output_filepath}")
            json.dump(chunked_content, out_file, indent=2)

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str, **kwargs
    ) -> ChunkStepResponse:
        """Chunk one record's elements file, reusing cached output when allowed.

        *fn* may be sync or async; async calls are gated by the context
        semaphore when one is configured.
        """
        elements_path = Path(path)
        file_data = FileData.from_file(path=file_data_path)
        output_filepath = self.get_output_filepath(filename=elements_path)
        if not self.should_chunk(filepath=output_filepath, file_data=file_data):
            logger.debug(f"Skipping chunking, output already exists: {output_filepath}")
            return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
        call_kwargs = {"elements_filepath": elements_path}
        if not asyncio.iscoroutinefunction(fn):
            raw_chunks = fn(**call_kwargs)
        else:
            semaphore = self.context.semaphore
            if semaphore:
                async with semaphore:
                    raw_chunks = await fn(**call_kwargs)
            else:
                raw_chunks = await fn(**call_kwargs)
        self._save_output(
            output_filepath=str(output_filepath),
            chunked_content=elements_to_dicts(raw_chunks),
        )
        return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Short deterministic hash of the chunker config plus *extras*."""
        payload = json.dumps(self.process.config.to_dict(), sort_keys=True, ensure_ascii=True)
        if extras:
            payload = payload + "".join(extras)
        return hashlib.sha256(payload.encode()).hexdigest()[:12]
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import hashlib
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Callable, Optional, TypedDict, TypeVar
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.v2.interfaces import FileData, download_responses
|
|
8
|
+
from unstructured_ingest.v2.interfaces.downloader import Downloader
|
|
9
|
+
from unstructured_ingest.v2.logger import logger
|
|
10
|
+
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
11
|
+
from unstructured_ingest.v2.pipeline.utils import sterilize_dict
|
|
12
|
+
|
|
13
|
+
# Any concrete Downloader implementation.
DownloaderT = TypeVar("DownloaderT", bound=Downloader)

# Identifier for this pipeline step; used in __str__ and the creation log line.
STEP_ID = "download"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DownloadStepResponse(TypedDict):
    """Result payload emitted by DownloadStep for the next pipeline step."""

    # Path to the serialized FileData JSON for this record.
    file_data_path: str
    # Path to the downloaded file on local disk.
    path: str
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class DownloadStep(PipelineStep):
    """Pipeline step that downloads the raw files described by FileData records."""

    process: DownloaderT
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.__class__.__name__})"

    def __post_init__(self):
        # Log redacted copies of the download and connection configs so
        # secrets don't leak into pipeline logs.
        config = (
            sterilize_dict(self.process.download_config.to_dict(redact_sensitive=True))
            if self.process.download_config
            else None
        )
        connection_config = (
            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
            if self.process.connection_config
            else None
        )
        logger.info(
            f"Created {self.identifier} with configs: {config}, "
            f"connection configs: {connection_config}"
        )

    @staticmethod
    def is_float(value: str):
        """Return True if *value* parses as a float, False otherwise."""
        try:
            float(value)
            return True
        except ValueError:
            return False

    def should_download(self, file_data: FileData, file_data_path: str) -> bool:
        """Decide whether this record's file needs to be (re)downloaded.

        Downloads when forced by the context, when no local copy exists, or
        when the mtime comparison below triggers; otherwise reuses the copy.
        """
        if self.context.re_download:
            return True
        download_path = self.process.get_download_path(file_data=file_data)
        if not download_path or not download_path.exists():
            return True
        # NOTE(review): this re-downloads when the LOCAL copy's mtime is newer
        # than the source's recorded date_modified — confirm the comparison
        # direction is intended.
        if (
            download_path.is_file()
            and file_data.metadata.date_modified
            and self.is_float(file_data.metadata.date_modified)
            and download_path.stat().st_mtime > float(file_data.metadata.date_modified)
        ):
            # Also update file data to mark this to reprocess since this won't change the filename
            file_data.reprocess = True
            file_data.to_file(path=file_data_path)
            return True
        return False

    async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
        """Run the downloader callable for one FileData record.

        Skips work when should_download() says the local copy is usable;
        otherwise invokes *fn* (sync or async, gated by the context semaphore
        when async) and normalizes its output via create_step_results().
        """
        file_data = FileData.from_file(path=file_data_path)
        download_path = self.process.get_download_path(file_data=file_data)
        if not self.should_download(file_data=file_data, file_data_path=file_data_path):
            logger.debug(f"Skipping download, file already exists locally: {download_path}")
            return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
        fn_kwargs = {"file_data": file_data}
        if not asyncio.iscoroutinefunction(fn):
            download_results = fn(**fn_kwargs)
        elif semaphore := self.context.semaphore:
            async with semaphore:
                download_results = await fn(**fn_kwargs)
        else:
            download_results = await fn(**fn_kwargs)
        return self.create_step_results(
            current_file_data_path=file_data_path, download_results=download_results
        )

    def create_step_results(
        self, current_file_data_path: str, download_results: download_responses
    ) -> list[DownloadStepResponse]:
        """Normalize downloader output into a list of DownloadStepResponse.

        A single (non-list) result reuses the current FileData path; a list of
        results persists each result's own FileData before responding.
        """
        if not isinstance(download_results, list):
            return [
                DownloadStepResponse(
                    file_data_path=current_file_data_path, path=str(download_results["path"])
                )
            ]
        # Supplemental results generated as part of the download process
        download_step_results = []
        for res in download_results:
            file_data_path = self.persist_new_file_data(file_data=res["file_data"])
            download_step_results.append(
                DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
            )
        return download_step_results

    def persist_new_file_data(self, file_data: FileData) -> str:
        """Write *file_data* as JSON into the step cache dir; return its path."""
        record_hash = self.get_hash(extras=[file_data.identifier])
        filename = f"{record_hash}.json"
        filepath = (self.cache_dir / filename).resolve()
        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(str(filepath), "w") as f:
            json.dump(file_data.to_dict(), f, indent=2)
        return str(filepath)

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Short deterministic hash of the download config plus *extras*."""
        hashable_string = json.dumps(
            sterilize_dict(self.process.download_config.to_dict()), sort_keys=True
        )
        if extras:
            hashable_string += "".join(extras)
        return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import hashlib
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, Optional, TypedDict
|
|
7
|
+
|
|
8
|
+
from unstructured.staging.base import elements_to_dicts
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.v2.interfaces import FileData
|
|
11
|
+
from unstructured_ingest.v2.logger import logger
|
|
12
|
+
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
13
|
+
from unstructured_ingest.v2.pipeline.utils import sterilize_dict
|
|
14
|
+
from unstructured_ingest.v2.processes.embedder import Embedder
|
|
15
|
+
|
|
16
|
+
# Identifier for this pipeline step; used in __str__ and the creation log line.
STEP_ID = "embed"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class EmbedStepResponse(TypedDict):
    """Result payload emitted by EmbedStep for the next pipeline step."""

    # Path to the serialized FileData JSON for this record.
    file_data_path: str
    # Path to the embedded-elements JSON written by this step.
    path: str
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class EmbedStep(PipelineStep):
    """Pipeline step that embeds chunked elements and caches the output as JSON."""

    process: Embedder
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.config.embedding_provider})"

    def __post_init__(self):
        # Log a redacted copy of the embedder config so secrets stay out of logs.
        if self.process.config:
            config = sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
        else:
            config = None
        logger.info(f"Created {self.identifier} with configs: {config}")

    def should_embed(self, filepath: Path, file_data: FileData) -> bool:
        """Embed when reprocessing is forced or no cached output exists yet."""
        forced = self.context.reprocess or file_data.reprocess
        return True if forced else not filepath.exists()

    def get_output_filepath(self, filename: Path) -> Path:
        """Resolve the cached-output path for *filename*, creating its parent dir."""
        out_name = f"{self.get_hash(extras=[filename.name])}.json"
        out_path = (self.cache_dir / out_name).resolve()
        out_path.parent.mkdir(parents=True, exist_ok=True)
        return out_path

    def _save_output(self, output_filepath: str, embedded_content: list[dict]):
        """Serialize the embedded elements to the cached output file."""
        with open(str(output_filepath), "w") as out_file:
            logger.debug(f"Writing embedded output to: {output_filepath}")
            json.dump(embedded_content, out_file, indent=2)

    async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
        """Embed one record's elements file, reusing cached output when allowed.

        *fn* may be sync or async; async calls are gated by the context
        semaphore when one is configured.
        """
        elements_path = Path(path)
        file_data = FileData.from_file(path=file_data_path)
        output_filepath = self.get_output_filepath(filename=elements_path)
        if not self.should_embed(filepath=output_filepath, file_data=file_data):
            logger.debug(f"Skipping embedding, output already exists: {output_filepath}")
            return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
        call_kwargs = {"elements_filepath": elements_path}
        if not asyncio.iscoroutinefunction(fn):
            raw_embedded = fn(**call_kwargs)
        else:
            semaphore = self.context.semaphore
            if semaphore:
                async with semaphore:
                    raw_embedded = await fn(**call_kwargs)
            else:
                raw_embedded = await fn(**call_kwargs)
        self._save_output(
            output_filepath=str(output_filepath),
            embedded_content=elements_to_dicts(raw_embedded),
        )
        return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Short deterministic hash of the embedder config plus *extras*."""
        payload = json.dumps(self.process.config.to_dict(), sort_keys=True, ensure_ascii=True)
        if extras:
            payload = payload + "".join(extras)
        return hashlib.sha256(payload.encode()).hexdigest()[:12]
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Generator, Optional, TypeVar
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.v2.interfaces.indexer import Indexer
|
|
7
|
+
from unstructured_ingest.v2.logger import logger
|
|
8
|
+
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
9
|
+
from unstructured_ingest.v2.pipeline.utils import sterilize_dict
|
|
10
|
+
|
|
11
|
+
# Any concrete Indexer implementation.
IndexerT = TypeVar("IndexerT", bound=Indexer)

# Identifier for this pipeline step; used in __str__ and the creation log line.
STEP_ID = "index"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class IndexStep(PipelineStep):
    """Pipeline step that runs an indexer and persists each FileData record.

    Each record produced by the indexer is written as JSON into the step's
    cache dir; the path of each written file is yielded to downstream steps.
    """

    process: IndexerT
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.__class__.__name__})"

    def __post_init__(self):
        # Log redacted copies of the index and connection configs so
        # secrets don't leak into pipeline logs.
        config = (
            sterilize_dict(self.process.index_config.to_dict(redact_sensitive=True))
            if self.process.index_config
            else None
        )
        connection_config = (
            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
            if self.process.connection_config
            else None
        )
        logger.info(
            f"Created {self.identifier} with configs: {config}, "
            f"connection configs: {connection_config}"
        )

    def run(self) -> Generator[str, None, None]:
        """Run the indexer, persisting each FileData and yielding its JSON path.

        Failures on individual records are logged and skipped unless the
        pipeline context requests raise_on_error.
        """
        for file_data in self.process.run():
            logger.debug(f"Generated file data: {file_data.to_dict()}")
            try:
                record_hash = self.get_hash(extras=[file_data.identifier])
                filename = f"{record_hash}.json"
                filepath = (self.cache_dir / filename).resolve()
                filepath.parent.mkdir(parents=True, exist_ok=True)
                with open(str(filepath), "w") as f:
                    json.dump(file_data.to_dict(), f, indent=2)
                yield str(filepath)
            except Exception:
                logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
                if self.context.raise_on_error:
                    # Bare raise preserves the original exception and traceback.
                    raise
                continue

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Short deterministic hash of the index config plus *extras*.

        Uses sterilize_dict and sort_keys=True — matching the hashing done by
        the other pipeline steps — so the hash is stable regardless of dict
        insertion order and tolerant of non-JSON-native config values.
        """
        hashable_string = json.dumps(
            sterilize_dict(self.process.index_config.to_dict()), sort_keys=True
        )
        if extras:
            hashable_string += "".join(extras)
        return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
|