unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any, TypeVar
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class AccessConfig(EnhancedDataClassJsonMixin):
    """Meant to designate holding any sensitive information
    (credentials, tokens, secrets) associated with other configs
    and also for access specific configs."""


# Generic placeholder so other configs can be typed against any AccessConfig subclass.
AccessConfigT = TypeVar("AccessConfigT", bound=AccessConfig)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class ConnectionConfig(EnhancedDataClassJsonMixin):
    """Connector configuration that bundles access/auth settings with any
    other connection parameters a connector needs."""

    access_config: AccessConfigT

    def get_access_config(self) -> dict[str, Any]:
        """Return the access config as a plain dict ({} when unset)."""
        config = self.access_config
        return config.to_dict(apply_name_overload=False) if config else {}


# Generic placeholder so processes can be typed against any ConnectionConfig subclass.
ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class BaseConnector(ABC):
    # Connection settings (including any access/auth config) for the
    # external system this connector talks to.
    connection_config: ConnectionConfigT
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Optional, TypedDict, TypeVar, Union
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
8
|
+
from unstructured_ingest.v2.interfaces.connector import BaseConnector
|
|
9
|
+
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
10
|
+
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class DownloaderConfig(EnhancedDataClassJsonMixin):
    # Target directory for downloaded files; when None, the Downloader
    # that uses this config derives a per-connector default cache path.
    download_dir: Optional[Path] = None


# Generic placeholder for any DownloaderConfig subclass.
DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DownloadResponse(TypedDict):
    """Result of downloading one file: the originating FileData record plus
    the local path the content was written to."""

    file_data: FileData
    path: Path


# A download run may produce a single response or a list of them.
download_responses = Union[list[DownloadResponse], DownloadResponse]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class Downloader(BaseProcess, BaseConnector, ABC):
    """Base class for connector download steps: fetches the raw content
    described by a FileData record onto the local filesystem."""

    connector_type: str
    download_config: DownloaderConfigT

    @staticmethod
    def is_float(value: str):
        """Return True when ``value`` parses as a float."""
        try:
            float(value)
        except ValueError:
            return False
        return True

    def generate_download_response(
        self, file_data: FileData, download_path: Path
    ) -> DownloadResponse:
        """Build the response for a completed download.

        When both created/modified timestamps are present as float strings,
        mirror them onto the downloaded file via os.utime.
        """
        meta = file_data.metadata
        has_valid_times = (
            meta.date_modified
            and self.is_float(meta.date_modified)
            and meta.date_created
            and self.is_float(meta.date_created)
        )
        if has_valid_times:
            os.utime(
                download_path,
                times=(float(meta.date_created), float(meta.date_modified)),
            )
        return DownloadResponse(file_data=file_data, path=download_path)

    @property
    def download_dir(self) -> Path:
        """Resolved download directory; defaults to a per-connector cache path."""
        if self.download_config.download_dir is None:
            default_dir = (
                Path.home()
                / ".cache"
                / "unstructured"
                / "ingest"
                / "download"
                / self.connector_type
            )
            self.download_config.download_dir = default_dir.resolve()
        return self.download_config.download_dir

    def is_async(self) -> bool:
        # Downloads are I/O bound, so async execution is preferred by default.
        return True

    def get_download_path(self, file_data: FileData) -> Optional[Path]:
        """Optional hook: subclasses may precompute the local path for a file."""
        return None

    @abstractmethod
    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        pass

    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
        # Default async implementation simply delegates to the sync run().
        return self.run(file_data=file_data, **kwargs)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Literal, Optional
|
|
5
|
+
|
|
6
|
+
from dataclasses_json import DataClassJsonMixin
|
|
7
|
+
from unstructured.documents.elements import DataSourceMetadata
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class SourceIdentifiers:
    """Names and paths identifying where a source file came from."""

    filename: str
    fullpath: str
    rel_path: Optional[str] = None

    @property
    def filename_stem(self) -> str:
        """Filename without its final suffix."""
        return Path(self.filename).stem

    @property
    def relative_path(self) -> str:
        """The relative path when set, otherwise the full path."""
        if self.rel_path:
            return self.rel_path
        return self.fullpath
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class FileData(DataClassJsonMixin):
    """Serializable record describing a single document (or batch of
    documents) moving through the pipeline, with its source metadata."""

    identifier: str
    connector_type: str
    source_identifiers: Optional[SourceIdentifiers] = None
    doc_type: Literal["file", "batch"] = field(default="file")
    metadata: DataSourceMetadata = field(default_factory=DataSourceMetadata)
    additional_metadata: dict[str, Any] = field(default_factory=dict)
    reprocess: bool = False

    @classmethod
    def from_file(cls, path: str) -> "FileData":
        """Deserialize a FileData record from a JSON file.

        Raises:
            ValueError: if ``path`` does not point to an existing regular file.
        """
        file_path = Path(path).resolve()
        # is_file() is already False for missing paths, covering exists() too.
        if not file_path.is_file():
            raise ValueError(f"file path not valid: {file_path}")
        with open(file_path, "rb") as f:
            file_data_dict = json.load(f)
        # Deserialize via cls (not FileData) so subclasses round-trip to
        # their own type instead of being downcast to the base class.
        return cls.from_dict(file_data_dict)

    def to_file(self, path: str) -> None:
        """Serialize this record as JSON to ``path``, creating parent dirs."""
        file_path = Path(path).resolve()
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, "w") as f:
            json.dump(self.to_dict(), f, indent=2)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any, Generator, Optional, TypeVar
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
6
|
+
from unstructured_ingest.v2.interfaces.connector import BaseConnector
|
|
7
|
+
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
8
|
+
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class IndexerConfig(EnhancedDataClassJsonMixin):
    # Marker base class: indexer-specific options are defined by subclasses.
    pass


# Generic placeholder for any IndexerConfig subclass.
IndexerConfigT = TypeVar("IndexerConfigT", bound=IndexerConfig)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Indexer(BaseProcess, BaseConnector, ABC):
    """Process that enumerates the contents of a source, yielding one
    FileData record per item to be processed downstream."""

    connector_type: str
    index_config: Optional[IndexerConfigT] = None

    def is_async(self) -> bool:
        # Indexing runs synchronously by default.
        return False

    @abstractmethod
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        pass
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class BaseProcess(ABC):
    """Smallest unit of pipeline work: a runnable step with optional
    async support and an optional connectivity check."""

    def is_async(self) -> bool:
        """Whether callers should prefer run_async() over run()."""
        return False

    @abstractmethod
    def run(self, **kwargs: Any) -> Any:
        """Execute the process synchronously."""
        ...

    async def run_async(self, **kwargs: Any) -> Any:
        """Async entry point; the default implementation delegates to run()."""
        return self.run(**kwargs)

    def check_connection(self):
        """Hook for processes that need external connections to run a
        quick connectivity check; no-op by default."""
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from asyncio import Semaphore
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
8
|
+
|
|
9
|
+
# Default location for pipeline working files, under the user's cache dir.
DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())


@dataclass
class ProcessorConfig(EnhancedDataClassJsonMixin):
    """Top-level options controlling pipeline execution: parallelism,
    reprocessing, download behavior, and the working directory."""

    reprocess: bool = False
    verbose: bool = False
    tqdm: bool = False
    work_dir: str = field(default_factory=lambda: DEFAULT_WORK_DIR)
    num_processes: int = 2
    max_connections: Optional[int] = None
    raise_on_error: bool = False
    disable_parallelism: bool = field(
        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true"
    )
    preserve_downloads: bool = False
    download_only: bool = False
    max_docs: Optional[int] = None
    re_download: bool = False
    uncompress: bool = False

    # Used to keep track of state in pipeline
    status: dict = field(default_factory=dict)
    semaphore: Optional[Semaphore] = field(init=False, default=None)

    def __post_init__(self):
        # Cap concurrent async work when a connection limit was requested.
        if self.max_connections is not None:
            self.semaphore = Semaphore(self.max_connections)

    @property
    def mp_supported(self) -> bool:
        """Whether multiprocessing may be used."""
        if self.disable_parallelism:
            return False
        return self.num_processes > 1

    @property
    def async_supported(self) -> bool:
        """Whether async execution may be used."""
        if self.disable_parallelism:
            return False
        # isinstance() is False for None, so a single check suffices here.
        if isinstance(self.max_connections, int):
            return self.max_connections > 1
        return True
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, TypeVar
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
7
|
+
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
8
|
+
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class UploadStagerConfig(EnhancedDataClassJsonMixin):
    # Marker base class: stager-specific options are defined by subclasses.
    pass


# Generic placeholder for any UploadStagerConfig subclass.
UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class UploadStager(BaseProcess, ABC):
    """Process that takes a partitioned-elements file and writes a staged
    version of it under ``output_dir`` ahead of the upload step."""

    upload_stager_config: UploadStagerConfigT

    @abstractmethod
    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any
    ) -> Path:
        """Stage the elements file and return the path of the staged output."""
        pass

    async def run_async(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any
    ) -> Path:
        # Default async implementation simply delegates to the sync run().
        return self.run(
            elements_filepath=elements_filepath,
            file_data=file_data,
            output_dir=output_dir,
            output_filename=output_filename,
            **kwargs,
        )
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, TypeVar
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
7
|
+
from unstructured_ingest.v2.interfaces.connector import BaseConnector
|
|
8
|
+
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
9
|
+
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class UploaderConfig(EnhancedDataClassJsonMixin):
    """Base configuration for Uploader implementations.

    Intentionally empty: destination connectors subclass this to declare
    their own upload settings (see the UploaderConfigT TypeVar below).
    """

    pass
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
UploaderConfigT = TypeVar("UploaderConfigT", bound=UploaderConfig)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class UploadContent:
    """One unit of work for an Uploader: a local file plus its metadata."""

    # Local path of the (staged) content to upload.
    path: Path
    # Metadata describing the original source of this content.
    file_data: FileData
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class Uploader(BaseProcess, BaseConnector, ABC):
    """Writes processed content to a destination.

    Subclasses implement the batch-oriented run(); the async single-item
    variant wraps its arguments into a one-element batch and delegates,
    so only connectors with a real async client need to override it.
    """

    upload_config: UploaderConfigT
    connector_type: str

    def is_async(self) -> bool:
        # Uploads are batched and synchronous unless a subclass says otherwise.
        return False

    @abstractmethod
    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Upload every entry in the given batch."""
        pass

    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Async adapter: uploads a single item as a one-element batch."""
        batch = [UploadContent(path=path, file_data=file_data)]
        return self.run(contents=batch, **kwargs)
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
from logging import Formatter, Logger, StreamHandler, getLevelName, getLogger
|
|
5
|
+
from typing import Any, Callable
|
|
6
|
+
|
|
7
|
+
log_level = os.getenv("INGEST_LOG_LEVEL", "INFO")
|
|
8
|
+
LOGGER_NAME = "unstructured.ingest.v2"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def default_is_data_sensitive(k: str, v: Any) -> bool:
    """Return True when a key/value pair should be redacted from logs.

    A pair is sensitive when its key contains one of the trigger substrings
    (key, cred, token, password, oauth, secret) or is one of the explicitly
    listed field names. Pairs with an empty/falsy value are never flagged:
    masking a missing value would misleadingly suggest a secret was present.

    Args:
        k: the field name being logged.
        v: the field value being logged.
    """
    sensitive_fields = [
        "account_name",
        "client_id",
    ]
    sensitive_triggers = ["key", "cred", "token", "password", "oauth", "secret"]
    key = k.lower()
    # Parenthesized explicitly: the original `v and A or B` masked listed
    # field names even when the value was empty/None.
    return bool(v) and (
        any(trigger in key for trigger in sensitive_triggers) or key in sensitive_fields
    )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def hide_sensitive_fields(
    data: dict, is_sensitive_fn: Callable[[str, Any], bool] = default_is_data_sensitive
) -> dict:
    """
    Will recursively look through every k, v pair in this dict and any nested ones and run
    is_sensitive_fn to dynamically redact the value of the k, v pair. Will also check if
    any string value can be parsed as valid json and process that dict as well and replace
    the original string with the json.dumps() version of the redacted dict.

    The input dict is not modified; a shallow-copied, redacted dict is returned.
    """
    new_data = data.copy()
    for k, v in new_data.items():
        if is_sensitive_fn(k, v):
            # Mask the whole value. Using elif below ensures the recursive
            # branches cannot overwrite this mask with a partially redacted
            # copy of a sensitive dict/json value (which would leak content).
            new_data[k] = "*******"
        elif isinstance(v, dict):
            # Propagate the caller's predicate so nested dicts are judged by
            # the same rules instead of silently reverting to the default.
            new_data[k] = hide_sensitive_fields(v, is_sensitive_fn=is_sensitive_fn)
        elif isinstance(v, str):
            # Need to take into account strings generated via json.dumps() or simply printing a dict
            try:
                json_data = json.loads(v)
            except json.JSONDecodeError:
                continue
            if isinstance(json_data, dict):
                updated_data = hide_sensitive_fields(json_data, is_sensitive_fn=is_sensitive_fn)
                new_data[k] = json.dumps(updated_data)

    return new_data
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def redact_jsons(s: str) -> str:
    """
    Takes in a generic string and pulls out all balanced '{...}' spans.
    Any span that parses as a dict -- either as json or as a python literal
    (e.g. a printed dict with single quotes) -- is redacted via
    hide_sensitive_fields() and substituted back into the string as json.
    Spans that are unbalanced or do not parse as a dict are left untouched,
    so a stray '{' no longer breaks redaction of the rest of the message.
    """
    if "{" not in s:
        return s
    chars = list(s)
    length = len(chars)
    candidates = []
    i = 0
    while i < length:
        if chars[i] != "{":
            i += 1
            continue
        # Capture the balanced {...} span starting at this brace.
        depth = 1
        span = [chars[i]]
        i += 1
        while depth and i < length:
            c = chars[i]
            span.append(c)
            if c == "{":
                depth += 1
            elif c == "}":
                depth -= 1
            i += 1
        if depth == 0:
            candidates.append("".join(span))
        # depth != 0 means an unbalanced trailing '{': end the scan cleanly
        # (previously this indexed past the end of the string).
    for candidate in candidates:
        # Normalize the span to canonical json text: try json first, then
        # fall back to python-literal syntax.
        try:
            normalized = json.dumps(json.loads(candidate))
        except json.JSONDecodeError:
            try:
                # TypeError covers literals json can't serialize (e.g. sets).
                normalized = json.dumps(ast.literal_eval(candidate))
            except (ValueError, SyntaxError, TypeError):
                continue
        parsed = json.loads(normalized)
        if not isinstance(parsed, dict):
            continue
        hidden = json.dumps(hide_sensitive_fields(parsed))
        s = s.replace(candidate, hidden)
    return s
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class SensitiveFormatter(Formatter):
    """Log formatter that redacts sensitive fields in json-like content."""

    def format(self, record):
        formatted = super().format(record=record)
        try:
            return redact_jsons(formatted)
        except Exception:
            # Redaction is best-effort: never let a failure crash logging,
            # but also never emit the raw (potentially unredacted) message.
            return f"Failed to redact: {formatted}"
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def remove_root_handlers(logger: Logger) -> None:
    """Detach every handler attached to the root logger.

    NOTE(robinson) - in some environments such as Google Colab, there is a root handler
    that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
    Removing these when they exist prevents this behavior
    """
    root = logger.root
    if root.hasHandlers():
        # Iterate over a copy: removing from root.handlers while iterating
        # the same list skips every other handler, leaving some attached.
        for handler in list(root.handlers):
            root.removeHandler(handler)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def make_default_logger(level: int) -> Logger:
    """Return a custom logger.

    Attaches a single redacting stream handler (idempotent across repeated
    calls), sets the requested level, and strips root-logger handlers that
    would bypass secret masking.
    """
    ingest_logger = getLogger(LOGGER_NAME)
    handler = StreamHandler()
    handler.name = "ingest_log_handler"
    handler.setFormatter(
        SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    )
    # Guard against adding a duplicate handler when called more than once
    # (e.g. once per spawned worker process).
    attached_names = {h.name for h in ingest_logger.handlers}
    if handler.name not in attached_names:
        ingest_logger.addHandler(handler)
    ingest_logger.setLevel(level)
    remove_root_handlers(ingest_logger)
    return ingest_logger
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
logger = make_default_logger(level=getLevelName(log_level.upper()))
|
|
File without changes
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import multiprocessing as mp
|
|
4
|
+
from abc import ABC
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from functools import wraps
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from time import time
|
|
10
|
+
from typing import Any, Awaitable, Callable, Optional, TypeVar
|
|
11
|
+
|
|
12
|
+
from tqdm import tqdm
|
|
13
|
+
from tqdm.asyncio import tqdm as tqdm_asyncio
|
|
14
|
+
|
|
15
|
+
from unstructured_ingest.v2.interfaces import BaseProcess, ProcessorConfig
|
|
16
|
+
from unstructured_ingest.v2.logger import logger, make_default_logger
|
|
17
|
+
|
|
18
|
+
BaseProcessT = TypeVar("BaseProcessT", bound=BaseProcess)
|
|
19
|
+
iterable_input = list[dict[str, Any]]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def timed(func):
    """Decorator for methods that logs how long each call took.

    Calls to __call__ are reported under the owning class name so pipeline
    steps show up with a readable label instead of '__call__'.
    """

    @wraps(func)
    def time_it(self, *args, **kwargs):
        started = time()
        try:
            return func(self, *args, **kwargs)
        finally:
            # finally: the duration is logged even when the call raises.
            reported_name = (
                f"{self.__class__.__name__} [cls]"
                if func.__name__ == "__call__"
                else func.__name__
            )
            logger.info(f"{reported_name} took {time() - started} seconds")

    return time_it
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class PipelineStep(ABC):
    """One step of an ingest pipeline.

    Wraps a BaseProcess and dispatches a list of work-item kwargs either
    serially, concurrently via asyncio, or across a multiprocessing pool,
    based on what both the process and the ProcessorConfig support.
    """

    process: BaseProcessT
    context: ProcessorConfig
    identifier: str

    def __str__(self):
        return self.identifier

    def process_serially(self, iterable: iterable_input) -> Any:
        """Run each work item one at a time in the current thread."""
        logger.info("processing content serially")
        if iterable:
            if len(iterable) == 1:
                return [self.run(**iterable[0])]
            if self.context.tqdm:
                return [self.run(**it) for it in tqdm(iterable, desc=self.identifier)]
            return [self.run(**it) for it in iterable]
        return [self.run()]

    async def _process_async(self, iterable: iterable_input) -> Any:
        """Run all work items concurrently on the current event loop."""
        if iterable:
            if len(iterable) == 1:
                return [await self.run_async(**iterable[0])]
            if self.context.tqdm:
                return await tqdm_asyncio.gather(
                    *[self.run_async(**i) for i in iterable], desc=self.identifier
                )
            return await asyncio.gather(*[self.run_async(**i) for i in iterable])
        return [await self.run_async()]

    def process_async(self, iterable: iterable_input) -> Any:
        logger.info("processing content async")
        return self.asyncio_run(fn=self._process_async, iterable=iterable)

    def asyncio_run(
        self, fn: Callable[[Any, Any], Awaitable[Any]], *args: Any, **kwargs: Any
    ) -> Any:
        """Run coroutine fn to completion even if a loop is already running.

        With no running loop in this thread, asyncio.run() is used directly;
        otherwise the coroutine is run in a dedicated thread to avoid trying
        to nest event loops.
        """
        # Use the public API instead of the private asyncio._get_running_loop():
        # get_running_loop() raises RuntimeError when no loop is running.
        try:
            current_loop = asyncio.get_running_loop()
        except RuntimeError:
            current_loop = None
        if current_loop is None:
            return asyncio.run(fn(*args, **kwargs))
        with ThreadPoolExecutor(thread_name_prefix="asyncio") as thread_pool:
            logger.warning(
                f"async code being run in dedicated thread pool "
                f"to not conflict with existing event loop: {current_loop}"
            )

            def wrapped():
                return asyncio.run(fn(*args, **kwargs))

            future = thread_pool.submit(wrapped)
            return future.result()

    def process_multiprocess(self, iterable: iterable_input) -> Any:
        """Fan work items out across a multiprocessing pool."""
        logger.info("processing content across processes")

        if iterable:
            if len(iterable) == 1:
                # Single item: no pool needed. Return the serial result
                # directly -- wrapping it in another list (as before) produced
                # [[result]], inconsistent with every other dispatch path.
                return self.process_serially(iterable)
            if self.context.num_processes == 1:
                return self.process_serially(iterable)
            with mp.Pool(
                processes=self.context.num_processes,
                initializer=self._init_logger,
                initargs=(logging.DEBUG if self.context.verbose else logging.INFO,),
            ) as pool:
                if self.context.tqdm:
                    return list(
                        tqdm(
                            pool.imap_unordered(func=self._wrap_mp, iterable=iterable),
                            total=len(iterable),
                            desc=self.identifier,
                        )
                    )
                return pool.map(self._wrap_mp, iterable)
        return [self.run()]

    def _wrap_mp(self, input_kwargs: dict) -> Any:
        # Allow mapping of kwargs via multiprocessing map()
        return self.run(**input_kwargs)

    def _init_logger(self, log_level: int):
        # Init logger for each spawned process when using multiprocessing pool
        make_default_logger(level=log_level)

    @timed
    def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
        """Dispatch items to async, multiprocess or serial processing."""
        iterable = iterable or []
        if iterable:
            logger.info(
                f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
            )
        if self.context.async_supported and self.process.is_async():
            return self.process_async(iterable=iterable)
        if self.context.mp_supported:
            return self.process_multiprocess(iterable=iterable)
        return self.process_serially(iterable=iterable)

    def _run(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
        # Bridge synchronous callers onto the async implementation.
        return self.asyncio_run(fn=self.run_async, _fn=fn, **kwargs)

    async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
        # Subclasses provide the actual per-item async work.
        raise NotImplementedError

    def run(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
        """Execute one work item; failures are recorded on context.status.

        Returns None on failure unless context.raise_on_error is set, in
        which case the original exception is re-raised.
        """
        try:
            fn = _fn or self.process.run
            return self._run(fn=fn, **kwargs)
        except Exception as e:
            logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
            if "file_data_path" in kwargs:
                self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
            if self.context.raise_on_error:
                raise e
            return None

    async def run_async(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
        """Async variant of run(); mirrors its error-handling contract."""
        try:
            fn = _fn or self.process.run_async
            return await self._run_async(fn=fn, **kwargs)
        except Exception as e:
            logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
            if "file_data_path" in kwargs:
                self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
            if self.context.raise_on_error:
                raise e
            return None

    @property
    def cache_dir(self) -> Path:
        # Per-step working directory under the pipeline's work dir.
        return Path(self.context.work_dir) / self.identifier
|