unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.fsspec.azure import (
|
|
9
|
+
AzureWriteConfig,
|
|
10
|
+
SimpleAzureBlobStorageConfig,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class AzureWriter(Writer):
    """
    Writer for the Azure Blob Storage destination.

    Holds the Azure connector config and write config, and exposes the
    matching destination connector class through get_connector_cls.
    """

    connector_config: "SimpleAzureBlobStorageConfig"
    write_config: "AzureWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the AzureBlobStorageDestinationConnector class (not an instance).
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.fsspec.azure import (
            AzureBlobStorageDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.fsspec.box import BoxWriteConfig, SimpleBoxConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class BoxWriter(Writer):
    """
    Writer for the Box destination.

    Holds the Box connector config and write config, and exposes the
    matching destination connector class through get_connector_cls.
    """

    connector_config: "SimpleBoxConfig"
    write_config: "BoxWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the BoxDestinationConnector class (not an instance).
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.fsspec.box import (
            BoxDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.fsspec.dropbox import DropboxWriteConfig, SimpleDropboxConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class DropboxWriter(Writer):
    """
    Writer for the Dropbox destination.

    Holds the Dropbox connector config and write config, and exposes the
    matching destination connector class through get_connector_cls.
    """

    connector_config: "SimpleDropboxConfig"
    write_config: "DropboxWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the DropboxDestinationConnector class (not an instance).
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.fsspec.dropbox import (
            DropboxDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.fsspec.gcs import GcsWriteConfig, SimpleGcsConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class GcsWriter(Writer):
    """
    Writer for the GCS destination.

    Holds the GCS connector config and write config, and exposes the
    matching destination connector class through get_connector_cls.
    """

    connector_config: "SimpleGcsConfig"
    write_config: "GcsWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the GcsDestinationConnector class (not an instance).
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.fsspec.gcs import (
            GcsDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.fsspec.s3 import S3WriteConfig, SimpleS3Config
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class S3Writer(Writer):
    """
    Writer for the S3 destination.

    Holds the S3 connector config and write config, and exposes the
    matching destination connector class through get_connector_cls.
    """

    connector_config: "SimpleS3Config"
    write_config: "S3WriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the S3DestinationConnector class (not an instance).
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.fsspec.s3 import S3DestinationConnector as connector_cls

        return connector_cls
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.kafka import KafkaWriteConfig, SimpleKafkaConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class KafkaWriter(Writer):
    """
    Writer for the Kafka destination.

    Holds the Kafka write config and connector config, and exposes the
    matching destination connector class through get_connector_cls.
    """

    write_config: "KafkaWriteConfig"
    connector_config: "SimpleKafkaConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the KafkaDestinationConnector class (not an instance).
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.kafka import KafkaDestinationConnector as connector_cls

        return connector_cls
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.mongodb import MongoDBWriteConfig, SimpleMongoDBConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class MongodbWriter(Writer):
    """
    Writer for the MongoDB destination.

    Holds the MongoDB write config and connector config, and exposes the
    matching destination connector class through get_connector_cls.
    """

    write_config: "MongoDBWriteConfig"
    connector_config: "SimpleMongoDBConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the MongoDBDestinationConnector class (not an instance).
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.mongodb import (
            MongoDBDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.elasticsearch import (
|
|
9
|
+
ElasticsearchWriteConfig,
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.connector.opensearch import (
|
|
12
|
+
SimpleOpenSearchConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class OpenSearchWriter(Writer):
    """
    Writer for the OpenSearch destination.

    Holds the OpenSearch connector config and a write config, and exposes the
    matching destination connector class through get_connector_cls.
    """

    connector_config: "SimpleOpenSearchConfig"
    # The write config type is the one declared by the Elasticsearch connector module.
    write_config: "ElasticsearchWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the OpenSearchDestinationConnector class (not an instance).

        The return annotation is t.Type[BaseDestinationConnector] because the
        method hands back the class object itself.
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.opensearch import (
            OpenSearchDestinationConnector,
        )

        return OpenSearchDestinationConnector
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.pinecone import PineconeWriteConfig, SimplePineconeConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class PineconeWriter(Writer):
    """
    Writer for the Pinecone destination.

    Holds the Pinecone write config and connector config, and exposes the
    matching destination connector class through get_connector_cls.
    """

    write_config: "PineconeWriteConfig"
    connector_config: "SimplePineconeConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the PineconeDestinationConnector class (not an instance).
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.pinecone import (
            PineconeDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.qdrant import QdrantWriteConfig, SimpleQdrantConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class QdrantWriter(Writer):
    """
    Writer for the Qdrant destination.

    Holds the Qdrant write config and connector config, and exposes the
    matching destination connector class through get_connector_cls.
    """

    write_config: "QdrantWriteConfig"
    connector_config: "SimpleQdrantConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the QdrantDestinationConnector class (not an instance).
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.qdrant import (
            QdrantDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.sql import SimpleSqlConfig
|
|
9
|
+
from unstructured_ingest.interfaces import WriteConfig
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class SqlWriter(Writer):
    """
    Writer for the SQL destination.

    Holds a generic WriteConfig and the SQL connector config, and exposes the
    matching destination connector class through get_connector_cls.
    """

    write_config: "WriteConfig"
    connector_config: "SimpleSqlConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the SqlDestinationConnector class (not an instance).
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.sql import SqlDestinationConnector as connector_cls

        return connector_cls
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
5
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
6
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
7
|
+
|
|
8
|
+
if t.TYPE_CHECKING:
|
|
9
|
+
from unstructured_ingest.connector.vectara import SimpleVectaraConfig, VectaraWriteConfig
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class VectaraWriter(Writer, EnhancedDataClassJsonMixin):
    """
    Writer for the Vectara destination.

    Holds the Vectara write config and connector config, and exposes the
    matching destination connector class through get_connector_cls. Also
    mixes in EnhancedDataClassJsonMixin.
    """

    write_config: "VectaraWriteConfig"
    connector_config: "SimpleVectaraConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the VectaraDestinationConnector class (not an instance).
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.vectara import (
            VectaraDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.weaviate import SimpleWeaviateConfig, WeaviateWriteConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class WeaviateWriter(Writer):
    """
    Writer for the Weaviate destination.

    Holds the Weaviate write config and connector config, and exposes the
    matching destination connector class through get_connector_cls.
    """

    write_config: "WeaviateWriteConfig"
    connector_config: "SimpleWeaviateConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """
        Return the WeaviateDestinationConnector class (not an instance).
        """
        # The connector module is imported on call rather than at module load.
        from unstructured_ingest.connector.weaviate import (
            WeaviateDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
File without changes
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
import tarfile
|
|
5
|
+
import zipfile
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.connector.local import LocalSourceConnector, SimpleLocalConfig
|
|
11
|
+
from unstructured_ingest.interfaces import (
|
|
12
|
+
BaseConnectorConfig,
|
|
13
|
+
BaseSingleIngestDoc,
|
|
14
|
+
ProcessorConfig,
|
|
15
|
+
ReadConfig,
|
|
16
|
+
)
|
|
17
|
+
from unstructured_ingest.logger import logger
|
|
18
|
+
|
|
19
|
+
# Filename suffixes that uncompress_file() routes to the zip extractor.
ZIP_FILE_EXT = [".zip"]
|
|
20
|
+
# Filename suffixes that uncompress_file() routes to the tar extractor.
TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def uncompress_file(filename: str, path: Optional[str] = None) -> str:
    """Extract a zip or tar archive and return the directory it was extracted to.

    The archive type is picked from the filename suffix (ZIP_FILE_EXT /
    TAR_FILE_EXT). When ``path`` is given it is created first, parents
    included; otherwise the chosen extractor derives the destination itself.

    Raises:
        ValueError: if ``filename`` ends with none of the recognized suffixes.
    """
    # Make sure an explicitly requested destination exists.
    if path:
        Path(path).mkdir(parents=True, exist_ok=True)

    # str.endswith accepts a tuple of candidate suffixes.
    if filename.endswith(tuple(ZIP_FILE_EXT)):
        return uncompress_zip_file(zip_filename=filename, path=path)
    if filename.endswith(tuple(TAR_FILE_EXT)):
        return uncompress_tar_file(tar_filename=filename, path=path)

    supported = ", ".join(ZIP_FILE_EXT + TAR_FILE_EXT)
    raise ValueError(
        "filename {} not a recognized compressed extension: {}".format(filename, supported),
    )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def uncompress_zip_file(zip_filename: str, path: Optional[str] = None) -> str:
    """Extract ``zip_filename`` and return the directory it was extracted to.

    When ``path`` is not given, the archive is extracted next to itself into
    ``<name without zip suffix>-zip-uncompressed``.
    """
    parent_dir, stem = os.path.split(zip_filename)

    # Drop the first matching archive suffix from the basename, if any.
    matched_ext = next((ext for ext in ZIP_FILE_EXT if stem.endswith(ext)), None)
    if matched_ext is not None:
        stem = stem[: -(len(matched_ext))]

    if not path:
        path = os.path.join(parent_dir, f"{stem}-zip-uncompressed")

    logger.info(f"extracting zip {zip_filename} -> {path}")
    with zipfile.ZipFile(zip_filename) as archive:
        archive.extractall(path=path)
    return path
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
    """Extract ``tar_filename`` and return the directory it was extracted to.

    Handles every suffix listed in TAR_FILE_EXT, i.e. both plain ``.tar``
    archives and gzip-compressed ones. When ``path`` is not given, the archive
    is extracted next to itself into
    ``<name without tar suffix>-tar-uncompressed``.
    """
    head, tail = os.path.split(tar_filename)
    # Drop the first matching archive suffix from the basename, if any.
    for ext in TAR_FILE_EXT:
        if tail.endswith(ext):
            tail = tail[: -(len(ext))]
            break

    path = path if path else os.path.join(head, f"{tail}-tar-uncompressed")
    logger.info(f"extracting tar {tar_filename} -> {path}")
    # "r:*" lets tarfile detect the compression transparently. The previous
    # "r:gz" mode raised ReadError for uncompressed ".tar" files even though
    # that suffix is accepted by TAR_FILE_EXT.
    with tarfile.open(tar_filename, "r:*") as tfile:
        # NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
        # Extraction filters were added in Python 3.12 and backported to security
        # releases of older versions, so probe for the feature instead of
        # comparing version numbers.
        # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
        if hasattr(tarfile, "tar_filter"):
            tfile.extraction_filter = tarfile.tar_filter
        else:
            logger.warning(
                "Extraction filtering for tar files is available for Python 3.12 and above. "
                "Consider upgrading your Python version to improve security. "
                "See https://docs.python.org/3/library/tarfile.html#extraction-filters"
            )
        tfile.extractall(path=path)
    return path
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass
class CompressionSourceConnectorMixin:
    """Mixin that turns a compressed ingest doc into ingest docs for its contents."""

    processor_config: ProcessorConfig
    read_config: ReadConfig
    connector_config: BaseConnectorConfig

    def process_compressed_doc(self, doc: BaseSingleIngestDoc) -> List[BaseSingleIngestDoc]:
        """Extract a compressed doc and return ingest docs for the extracted files.

        The doc is fetched locally, uncompressed, and the resulting directory
        is walked recursively by a LocalSourceConnector built from shallow
        copies of this connector's read and processor configs.
        """
        # Download the raw archive to local storage before extracting it.
        doc.get_file()
        extracted_dir = uncompress_file(filename=str(doc.filename))

        # Shallow copies, so the output_dir override below stays off self.processor_config.
        read_config_copy = copy.copy(self.read_config)
        processor_config_copy = copy.copy(self.processor_config)

        # Extraction directory with the download dir removed from it.
        relative_dir = extracted_dir.replace(self.read_config.download_dir, "")

        # Insert a separator only when the configured output dir lacks a trailing one.
        output_root = self.processor_config.output_dir
        joiner = "" if output_root.endswith(os.sep) else os.sep
        processor_config_copy.output_dir = f"{output_root}{joiner}{relative_dir}"

        local_config = SimpleLocalConfig(input_path=extracted_dir, recursive=True)
        local_connector = LocalSourceConnector(
            connector_config=local_config,
            read_config=read_config_copy,
            processor_config=processor_config_copy,
        )
        logger.info(f"Created local source connector: {local_connector.to_json()}")
        local_connector.initialize()
        return local_connector.get_ingest_docs()
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
import json
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Any, Optional, Sequence, cast
|
|
5
|
+
|
|
6
|
+
# strptime patterns tried, in order, by validate_date_args().
DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def batch_generator(iterable, batch_size=100):
    """Yield consecutive tuples of at most ``batch_size`` items from ``iterable``.

    The final tuple may be shorter; nothing is yielded for an empty iterable.
    """
    iterator = iter(iterable)
    # islice returns an empty slice once the iterator is exhausted, which ends the loop.
    while batch := tuple(itertools.islice(iterator, batch_size)):
        yield batch
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def generator_batching_wbytes(iterable, batch_size_limit_bytes=15_000_000):
    """Yield lists of items whose combined JSON-encoded size stays within a byte limit.

    Each item's size is the length of its UTF-8 encoded ``json.dumps`` output.
    Items are accumulated in order; a batch is emitted as soon as adding the
    next item would exceed ``batch_size_limit_bytes``. An item that is larger
    than the limit on its own is emitted in a batch by itself.

    Empty batches are never yielded. Previously an oversized item arriving
    while the current batch was empty (e.g. as the very first item) caused an
    empty list to be yielded.
    """
    current_batch, current_batch_size = [], 0

    for item in iterable:
        item_size_bytes = len(json.dumps(item).encode("utf-8"))

        # Flush only a non-empty batch that the new item would overflow.
        if current_batch and current_batch_size + item_size_bytes > batch_size_limit_bytes:
            yield current_batch
            current_batch, current_batch_size = [], 0

        current_batch.append(item)
        current_batch_size += item_size_bytes

    if current_batch:
        yield current_batch
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def flatten_dict(
    dictionary: dict[str, Any],
    parent_key: str = "",
    separator: str = "_",
    flatten_lists: bool = False,
    remove_none: bool = False,
    keys_to_omit: Optional[Sequence[str]] = None,
) -> dict[str, Any]:
    """Collapse a nested dictionary into a single-level dictionary.

    Nested keys are joined as ``{parent_key}{separator}{key}``. Any joined key
    listed in ``keys_to_omit`` is copied through with its value untouched
    instead of being flattened. With ``flatten_lists`` set, lists and tuples
    are expanded too, using the element index as the key segment. With
    ``remove_none`` set, entries whose value is None are dropped.
    """
    omitted = keys_to_omit or []
    result: dict[str, Any] = {}

    for key, value in dictionary.items():
        full_key = f"{parent_key}{separator}{key}" if parent_key else key

        # Omitted keys win over every other rule, including remove_none.
        if full_key in omitted:
            result[full_key] = value
            continue

        if remove_none and value is None:
            continue

        if isinstance(value, dict):
            nested = cast("dict[str, Any]", value)
            result.update(
                flatten_dict(
                    nested, full_key, separator, flatten_lists, remove_none, keys_to_omit=omitted
                ),
            )
            continue

        if flatten_lists and isinstance(value, (list, tuple)):
            elements = cast("list[Any] | tuple[Any]", value)
            for position, element in enumerate(elements):
                # Wrap each element under its indexed key and recurse with an empty
                # parent so omit/None/nesting rules apply to that indexed key.
                indexed = {f"{full_key}{separator}{position}": element}
                result.update(
                    flatten_dict(
                        indexed,
                        "",
                        separator,
                        flatten_lists,
                        remove_none,
                        keys_to_omit=omitted,
                    )
                )
            continue

        result[full_key] = value

    return result
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def validate_date_args(date: Optional[str] = None) -> bool:
    """Check that ``date`` matches one of the supported date formats.

    Returns `True` as soon as the string parses with any entry of
    DATE_FORMATS; raises `ValueError` when it is empty/None or matches none.

    Supported Date Formats:
        - 'YYYY-MM-DD'
        - 'YYYY-MM-DDTHH:MM:SS'
        - 'YYYY-MM-DD+HH:MM:SS'
        - 'YYYY-MM-DDTHH:MM:SS±HHMM'
    """
    if not date:
        raise ValueError("The argument date is None.")

    for date_format in DATE_FORMATS:
        try:
            datetime.strptime(date, date_format)
        except ValueError:
            # Not this format; try the next one.
            continue
        return True

    raise ValueError(
        f"The argument {date} does not satisfy the format:"
        f" YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SS±HHMM",
    )
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import importlib
|
|
5
|
+
from functools import wraps
|
|
6
|
+
from typing import (
|
|
7
|
+
Callable,
|
|
8
|
+
List,
|
|
9
|
+
Optional,
|
|
10
|
+
TypeVar,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
from typing_extensions import ParamSpec
|
|
14
|
+
|
|
15
|
+
# Return type of the function wrapped by requires_dependencies().
_T = TypeVar("_T")
|
|
16
|
+
# Parameter specification of the function wrapped by requires_dependencies().
_P = ParamSpec("_P")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def requires_dependencies(
    dependencies: str | list[str],
    extras: Optional[str] = None,
) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
    """Build a decorator that checks importable dependencies before each call.

    Args:
        dependencies: one module name or a list of module names that must be
            importable for the decorated function to run.
        extras: optional name of the package extra that provides them; used
            only to build the install hint in the error message.

    Returns:
        A decorator. The decorated function (sync or async) runs the check on
        every call and raises ``ImportError`` listing the missing modules.
    """
    if isinstance(dependencies, str):
        dependencies = [dependencies]

    def decorator(func: Callable[_P, _T]) -> Callable[_P, _T]:
        def run_check():
            missing_deps: List[str] = [dep for dep in dependencies if not dependency_exists(dep)]
            if missing_deps:
                # The extras belong to this distribution (unstructured-ingest),
                # so the hint must name it rather than the `unstructured` package.
                raise ImportError(
                    f"Following dependencies are missing: {', '.join(missing_deps)}. "
                    + (
                        f"""Please install them using `pip install "unstructured-ingest[{extras}]"`."""
                        if extras
                        else f"Please install them using `pip install {' '.join(missing_deps)}`."
                    ),
                )

        @wraps(func)
        def wrapper(*args: _P.args, **kwargs: _P.kwargs):
            run_check()
            return func(*args, **kwargs)

        @wraps(func)
        async def wrapper_async(*args: _P.args, **kwargs: _P.kwargs):
            run_check()
            return await func(*args, **kwargs)

        # Coroutine functions need an async wrapper so the result stays awaitable.
        if asyncio.iscoroutinefunction(func):
            return wrapper_async
        return wrapper

    return decorator
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def dependency_exists(dependency: str):
    """Return whether ``dependency`` can be imported.

    An ImportError counts as "missing" only when its repr mentions the
    dependency name; any other ImportError raised during the import is treated
    as unrelated and the dependency is reported as present.
    """
    try:
        importlib.import_module(dependency)
    except ImportError as exc:
        # False when the error names this dependency, True for unrelated errors.
        return dependency not in repr(exc)
    return True
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import typing as t
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
from dateutil import parser
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
    """Try to deserialize a JSON string, returning the input unchanged on failure.

    Two attempts are made: the string as given, then the string with single
    quotes swapped for double quotes.
    """
    try:
        return json.loads(json_string)
    except json.JSONDecodeError:
        # Not necessarily an error: the value may be a path or malformed JSON.
        # Single quotes used in place of double quotes are a common cause.
        requoted = json_string.replace("'", '"')

    try:
        return json.loads(requoted)
    except json.JSONDecodeError:
        # Not necessarily an error if it is a path; hand the input back as is.
        return json_string
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def ensure_isoformat_datetime(timestamp: t.Union[datetime, str]) -> str:
    """Convert a datetime object or a datetime string to an ISO format string.

    Raises:
        ValueError: if a string input cannot be parsed as a datetime.
        TypeError: if the input is neither a datetime nor a str.
    """
    if isinstance(timestamp, datetime):
        return timestamp.isoformat()

    if not isinstance(timestamp, str):
        raise TypeError(f"Expected input type datetime or str, but got {type(timestamp)}.")

    try:
        # dateutil's parser accepts the datetime string in various formats.
        return parser.parse(timestamp).isoformat()
    except ValueError as e:
        raise ValueError(f"String '{timestamp}' could not be parsed as a datetime.") from e
|