unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import json
|
|
3
|
+
import typing as t
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
8
|
+
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
9
|
+
from unstructured_ingest.interfaces import (
|
|
10
|
+
AccessConfig,
|
|
11
|
+
BaseConnectorConfig,
|
|
12
|
+
BaseDestinationConnector,
|
|
13
|
+
WriteConfig,
|
|
14
|
+
)
|
|
15
|
+
from unstructured_ingest.logger import logger
|
|
16
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
|
+
|
|
18
|
+
if t.TYPE_CHECKING:
|
|
19
|
+
from weaviate import Client
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class WeaviateAccessConfig(AccessConfig):
    """Credentials used to authenticate against a Weaviate instance.

    All fields are optional. ``WeaviateDestinationConnector._resolve_auth_method``
    picks the first usable option in this order: ``anonymous``, ``access_token``,
    ``api_key``, ``client_secret``, then ``username`` together with ``password``.
    """

    # Bearer-token authentication (``refresh_token`` is passed along with it).
    access_token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    refresh_token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # API-key authentication.
    api_key: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # Client-credentials authentication; ``scope`` is forwarded as well.
    client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    scope: t.Optional[t.List[str]] = None
    # Username/password authentication; both values must be set to be used.
    username: t.Optional[str] = None
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # When true, no credentials are sent at all.
    anonymous: bool = False
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class SimpleWeaviateConfig(BaseConnectorConfig):
    """Connection settings for the Weaviate destination connector."""

    # Credentials used to build the client's auth object.
    access_config: WeaviateAccessConfig
    # URL handed to the Weaviate ``Client`` constructor.
    host_url: str
    # Name of the Weaviate class that receives the written objects.
    class_name: str
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class WeaviateWriteConfig(WriteConfig):
    """Write-time options for the Weaviate destination connector."""

    # Value passed to ``client.batch.configure(batch_size=...)`` before writing.
    batch_size: int = 100
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class WeaviateDestinationConnector(BaseDestinationConnector):
    """Destination connector that writes element dictionaries to a Weaviate class.

    The Weaviate client is created lazily the first time ``client`` is accessed
    and is dropped from the serialized form (see ``to_dict``).
    """

    write_config: WeaviateWriteConfig
    connector_config: SimpleWeaviateConfig
    _client: t.Optional["Client"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """
        The _client variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_client"):
            setattr(self_cp, "_client", None)
        return _asdict(self_cp, **kwargs)

    @property
    @requires_dependencies(["weaviate"], extras="weaviate")
    def client(self) -> "Client":
        """Return the Weaviate client, creating and caching it on first access."""
        if self._client is None:
            from weaviate import Client

            auth = self._resolve_auth_method()
            self._client = Client(url=self.connector_config.host_url, auth_client_secret=auth)
        return self._client

    @requires_dependencies(["weaviate"], extras="weaviate")
    @DestinationConnectionError.wrap
    def initialize(self):
        """Force creation of the client so connection problems surface early."""
        _ = self.client

    @requires_dependencies(["weaviate"], extras="weaviate")
    def check_connection(self):
        """Validate that a client can be created from the current configuration.

        Raises:
            SourceConnectionError: if creating the client fails for any reason.
        """
        try:
            _ = self.client
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            # NOTE(review): this is a destination connector yet it raises
            # SourceConnectionError; the type is kept so that existing callers
            # catching it keep working. The original cause is now chained.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def _resolve_auth_method(self):
        """Build the Weaviate auth object matching the configured credentials.

        Precedence: anonymous, access token, API key, client secret, then
        username + password. Returns ``None`` when anonymous access is
        requested or when no credentials are configured.
        """
        access_configs = self.connector_config.access_config
        if access_configs.anonymous:
            return None

        if access_configs.access_token:
            from weaviate.auth import AuthBearerToken

            return AuthBearerToken(
                access_token=access_configs.access_token,
                refresh_token=access_configs.refresh_token,
            )
        elif access_configs.api_key:
            from weaviate.auth import AuthApiKey

            return AuthApiKey(api_key=access_configs.api_key)
        elif access_configs.client_secret:
            from weaviate.auth import AuthClientCredentials

            return AuthClientCredentials(
                client_secret=access_configs.client_secret, scope=access_configs.scope
            )
        elif access_configs.username and access_configs.password:
            from weaviate.auth import AuthClientPassword

            return AuthClientPassword(
                username=access_configs.username,
                password=access_configs.password,
                scope=access_configs.scope,
            )
        return None

    @staticmethod
    def _format_timestamp(value: str) -> str:
        """Parse a date string and render it as ``%Y-%m-%dT%H:%M:%S.%fZ``.

        The trailing ``Z`` declares UTC, so timezone-aware inputs are converted
        to UTC first. Naive inputs are formatted unchanged.
        """
        from datetime import timezone

        from dateutil import parser

        parsed = parser.parse(value)
        if parsed.utcoffset() is not None:
            parsed = parsed.astimezone(timezone.utc)
        return parsed.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    def conform_dict(self, data: dict) -> None:
        """
        Updates the element dictionary to conform to the Weaviate schema

        The dictionary is modified in place. Missing, empty or ``None`` values
        (including a ``None`` ``metadata``/``data_source``/``coordinates``
        container) are left untouched.
        """
        # ``or {}`` also covers keys that are present but set to None. When the
        # container exists and is non-empty the very same object is returned,
        # so the assignments below still mutate ``data``.
        metadata = data.get("metadata") or {}
        data_source = metadata.get("data_source") or {}
        coordinates = metadata.get("coordinates") or {}

        # Dict as string formatting
        if record_locator := data_source.get("record_locator"):
            data_source["record_locator"] = json.dumps(record_locator)

        # Array of items as string formatting
        if points := coordinates.get("points"):
            coordinates["points"] = json.dumps(points)

        if links := metadata.get("links"):
            metadata["links"] = json.dumps(links)

        if permissions_data := data_source.get("permissions_data"):
            data_source["permissions_data"] = json.dumps(permissions_data)

        # Datetime formatting
        for date_key in ("date_created", "date_modified", "date_processed"):
            if date_value := data_source.get(date_key):
                data_source[date_key] = self._format_timestamp(date_value)

        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = self._format_timestamp(last_modified)

        # String casting
        if version := data_source.get("version"):
            data_source["version"] = str(version)

        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Write every element dictionary to the configured Weaviate class in batches.

        Note that the ``embeddings`` entry is popped from each element (the
        caller's dictionaries are modified) and sent as the object's vector.
        """
        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"class {self.connector_config.class_name} "
            f"at {self.connector_config.host_url}",
        )

        self.client.batch.configure(batch_size=self.write_config.batch_size)
        with self.client.batch as b:
            for e in elements_dict:
                vector = e.pop("embeddings", None)
                b.add_data_object(
                    e,
                    self.connector_config.class_name,
                    vector=vector,
                )
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
6
|
+
from unstructured_ingest.interfaces import (
|
|
7
|
+
BaseConnectorConfig,
|
|
8
|
+
BaseSingleIngestDoc,
|
|
9
|
+
BaseSourceConnector,
|
|
10
|
+
IngestDocCleanupMixin,
|
|
11
|
+
SourceConnectorCleanupMixin,
|
|
12
|
+
SourceMetadata,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
16
|
+
|
|
17
|
+
if t.TYPE_CHECKING:
|
|
18
|
+
from wikipedia import WikipediaPage
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class SimpleWikipediaConfig(BaseConnectorConfig):
    """Settings that identify the Wikipedia page to ingest."""

    # Title passed to ``wikipedia.page``.
    page_title: str
    # Forwarded to ``wikipedia.page`` as its ``auto_suggest`` argument.
    auto_suggest: bool = False
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class WikipediaIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Base ingest document for a single Wikipedia page.

    ``filename``, ``text`` and ``_output_filename`` raise ``NotImplementedError``
    here and are expected to be provided by subclasses.
    """

    connector_config: SimpleWikipediaConfig = field(repr=False)

    @property
    @requires_dependencies(["wikipedia"], extras="wikipedia")
    def page(self) -> "WikipediaPage":
        """Look up the configured page through the ``wikipedia`` package.

        No caching happens here: each access calls ``wikipedia.page`` again.
        """
        import wikipedia

        config = self.connector_config
        return wikipedia.page(config.page_title, auto_suggest=config.auto_suggest)

    def get_filename_prefix(self) -> str:
        """Return the page title with every whitespace run replaced by one ``-``."""
        words = str(self.connector_config.page_title).split()
        return "-".join(words)

    @property
    def filename(self) -> Path:
        """Local download path; must be provided by a subclass."""
        raise NotImplementedError()

    @property
    def text(self) -> str:
        """Content written to ``filename``; must be provided by a subclass."""
        raise NotImplementedError()

    @property
    def _output_filename(self):
        """Output path; must be provided by a subclass."""
        raise NotImplementedError()

    @property
    def date_created(self) -> t.Optional[str]:
        """Always ``None`` for this connector."""
        return None

    @property
    def date_modified(self) -> t.Optional[str]:
        """Always ``None`` for this connector."""
        return None

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Return the page title and source URL that identify this document."""
        locator = {
            "page_title": self.connector_config.page_title,
            "page_url": self.source_metadata.source_url,  # type: ignore
        }
        return locator

    def _create_full_tmp_dir_path(self):
        """Make sure the directory that will hold ``filename`` exists."""
        target_dir = self.filename.parent
        target_dir.mkdir(parents=True, exist_ok=True)

    @requires_dependencies(["wikipedia"], extras="wikipedia")
    def update_source_metadata(self):
        """Refresh ``source_metadata`` from the page.

        A ``PageError`` during lookup is recorded as ``exists=False``.
        """
        from wikipedia.exceptions import PageError

        try:
            wiki_page = self.page
        except PageError:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
        else:
            self.source_metadata = SourceMetadata(
                version=wiki_page.revision_id,
                source_url=wiki_page.url,
                exists=True,
            )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetches the "remote" doc and stores it locally on the filesystem."""
        self._create_full_tmp_dir_path()
        self.update_source_metadata()
        # The file is opened before ``self.text`` is evaluated, as in the original.
        with open(self.filename, "w", encoding="utf8") as out_file:
            out_file.write(self.text)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass
class WikipediaIngestHTMLDoc(WikipediaIngestDoc):
    """Ingest doc that stores the page's HTML rendering."""

    registry_name: str = "wikipedia_html"

    @property
    def filename(self) -> Path:
        """Resolved ``<download_dir>/<prefix>.html`` path for the downloaded HTML."""
        download_root = Path(self.read_config.download_dir)
        html_path = download_root / f"{self.get_filename_prefix()}.html"
        return html_path.resolve()

    @property
    def text(self):
        """HTML of the page, as returned by ``_get_html``."""
        return self._get_html()

    @SourceConnectionNetworkError.wrap
    def _get_html(self):
        """Return the result of calling ``html()`` on the page object."""
        wiki_page = self.page
        return wiki_page.html()

    @property
    def _output_filename(self):
        """``<output_dir>/<prefix>-html.json`` path for the processed output."""
        output_root = Path(self.processor_config.output_dir)
        return output_root / f"{self.get_filename_prefix()}-html.json"
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@dataclass
class WikipediaIngestTextDoc(WikipediaIngestDoc):
    """Ingest doc that stores the page's ``content`` attribute as text."""

    registry_name: str = "wikipedia_text"

    @property
    def filename(self) -> Path:
        """Resolved ``<download_dir>/<prefix>.txt`` path for the downloaded text."""
        download_root = Path(self.read_config.download_dir)
        text_path = download_root / f"{self.get_filename_prefix()}.txt"
        return text_path.resolve()

    @property
    def text(self):
        """Content of the page, as returned by ``_get_content``."""
        return self._get_content()

    @SourceConnectionNetworkError.wrap
    def _get_content(self):
        """Return the ``content`` attribute of the page object."""
        wiki_page = self.page
        return wiki_page.content

    @property
    def _output_filename(self):
        """``<output_dir>/<prefix>-txt.json`` path for the processed output."""
        output_root = Path(self.processor_config.output_dir)
        return output_root / f"{self.get_filename_prefix()}-txt.json"
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@dataclass
class WikipediaIngestSummaryDoc(WikipediaIngestDoc):
    """Ingest doc that stores the page's ``summary`` attribute as text."""

    registry_name: str = "wikipedia_summary"

    @property
    def filename(self) -> Path:
        """Resolved ``<download_dir>/<prefix>-summary.txt`` path for the summary."""
        download_root = Path(self.read_config.download_dir)
        summary_path = download_root / f"{self.get_filename_prefix()}-summary.txt"
        return summary_path.resolve()

    @property
    def text(self):
        """Summary of the page, as returned by ``_get_summary``."""
        return self._get_summary()

    @SourceConnectionNetworkError.wrap
    def _get_summary(self):
        """Return the ``summary`` attribute of the page object."""
        wiki_page = self.page
        return wiki_page.summary

    @property
    def _output_filename(self):
        """``<output_dir>/<prefix>-summary.json`` path for the processed output."""
        output_root = Path(self.processor_config.output_dir)
        return output_root / f"{self.get_filename_prefix()}-summary.json"
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@dataclass
class WikipediaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that yields the text, HTML and summary docs of one page."""

    connector_config: SimpleWikipediaConfig

    def initialize(self):
        """No setup is required for this connector."""
        pass

    @requires_dependencies(["wikipedia"], extras="wikipedia")
    def check_connection(self):
        """Verify that the configured page can be looked up.

        Raises:
            SourceConnectionError: if ``wikipedia.page`` raises for any reason.
                The original exception is attached as the explicit cause.
        """
        import wikipedia

        try:
            wikipedia.page(
                self.connector_config.page_title,
                auto_suggest=self.connector_config.auto_suggest,
            )
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # ``from e`` records the underlying failure as ``__cause__`` so the
            # traceback shows it as the direct cause of the connection error.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_ingest_docs(self):
        """Return one text, one HTML and one summary ingest doc, in that order."""
        doc_classes = (
            WikipediaIngestTextDoc,
            WikipediaIngestHTMLDoc,
            WikipediaIngestSummaryDoc,
        )
        return [
            doc_cls(
                processor_config=self.processor_config,
                connector_config=self.connector_config,
                read_config=self.read_config,
            )
            for doc_cls in doc_classes
        ]
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import _thread
|
|
2
|
+
import copy
|
|
3
|
+
import functools
|
|
4
|
+
from dataclasses import fields
|
|
5
|
+
|
|
6
|
+
from dataclasses_json.core import (
|
|
7
|
+
Collection,
|
|
8
|
+
Enum,
|
|
9
|
+
Mapping,
|
|
10
|
+
_encode_overrides,
|
|
11
|
+
_handle_undefined_parameters_safe,
|
|
12
|
+
_user_overrides_or_exts,
|
|
13
|
+
is_dataclass,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _recursive_repr(user_function):
|
|
18
|
+
# Copied from dataclasses as this method isn't exposed for importing
|
|
19
|
+
repr_running = set()
|
|
20
|
+
|
|
21
|
+
@functools.wraps(user_function)
|
|
22
|
+
def wrapper(self):
|
|
23
|
+
key = id(self), _thread.get_ident()
|
|
24
|
+
if key in repr_running:
|
|
25
|
+
return "..."
|
|
26
|
+
repr_running.add(key)
|
|
27
|
+
try:
|
|
28
|
+
result = user_function(self)
|
|
29
|
+
finally:
|
|
30
|
+
repr_running.discard(key)
|
|
31
|
+
return result
|
|
32
|
+
|
|
33
|
+
return wrapper
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _asdict(
|
|
37
|
+
obj,
|
|
38
|
+
encode_json=False,
|
|
39
|
+
redact_sensitive=False,
|
|
40
|
+
redacted_text="***REDACTED***",
|
|
41
|
+
apply_name_overload: bool = True,
|
|
42
|
+
):
|
|
43
|
+
"""
|
|
44
|
+
A re-implementation of `asdict` (based on the original in the `dataclasses`
|
|
45
|
+
source) to support arbitrary Collection and Mapping types.
|
|
46
|
+
"""
|
|
47
|
+
if is_dataclass(obj):
|
|
48
|
+
result = []
|
|
49
|
+
overrides = _user_overrides_or_exts(obj)
|
|
50
|
+
for field in fields(obj):
|
|
51
|
+
if overrides[field.name].encoder:
|
|
52
|
+
value = getattr(obj, field.name)
|
|
53
|
+
else:
|
|
54
|
+
value = _asdict(
|
|
55
|
+
getattr(obj, field.name),
|
|
56
|
+
encode_json=encode_json,
|
|
57
|
+
redact_sensitive=redact_sensitive,
|
|
58
|
+
redacted_text=redacted_text,
|
|
59
|
+
apply_name_overload=apply_name_overload,
|
|
60
|
+
)
|
|
61
|
+
if getattr(field, "sensitive", False) and redact_sensitive and value:
|
|
62
|
+
value = redacted_text
|
|
63
|
+
if getattr(field, "overload_name", None) and apply_name_overload:
|
|
64
|
+
overload_name = getattr(field, "overload_name")
|
|
65
|
+
result.append((overload_name, value))
|
|
66
|
+
else:
|
|
67
|
+
result.append((field.name, value))
|
|
68
|
+
|
|
69
|
+
result = _handle_undefined_parameters_safe(cls=obj, kvs=dict(result), usage="to")
|
|
70
|
+
return _encode_overrides(
|
|
71
|
+
dict(result), _user_overrides_or_exts(obj), encode_json=encode_json
|
|
72
|
+
)
|
|
73
|
+
elif isinstance(obj, Mapping):
|
|
74
|
+
return {
|
|
75
|
+
_asdict(
|
|
76
|
+
k,
|
|
77
|
+
encode_json=encode_json,
|
|
78
|
+
redact_sensitive=redact_sensitive,
|
|
79
|
+
redacted_text=redacted_text,
|
|
80
|
+
): _asdict(
|
|
81
|
+
v,
|
|
82
|
+
encode_json=encode_json,
|
|
83
|
+
redact_sensitive=redact_sensitive,
|
|
84
|
+
redacted_text=redacted_text,
|
|
85
|
+
)
|
|
86
|
+
for k, v in obj.items()
|
|
87
|
+
}
|
|
88
|
+
elif isinstance(obj, Collection) and not isinstance(obj, (str, bytes, Enum)):
|
|
89
|
+
return [
|
|
90
|
+
_asdict(
|
|
91
|
+
v,
|
|
92
|
+
encode_json=encode_json,
|
|
93
|
+
redact_sensitive=redact_sensitive,
|
|
94
|
+
redacted_text=redacted_text,
|
|
95
|
+
)
|
|
96
|
+
for v in obj
|
|
97
|
+
]
|
|
98
|
+
else:
|
|
99
|
+
return copy.deepcopy(obj)
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import MISSING, Field
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.enhanced_dataclass.core import _recursive_repr
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class EnhancedField(Field):
    """A ``dataclasses.Field`` carrying two extra attributes.

    Attributes:
        sensitive: Flag stored on the field (defaults to ``False``).
        overload_name: Optional alternative name stored on the field.
    """

    def __init__(self, *args, sensitive=False, overload_name: t.Optional[str] = None):
        """Pass ``args`` positionally to ``Field.__init__`` and store the extras."""
        super().__init__(*args)
        self.sensitive = sensitive
        self.overload_name = overload_name

    @_recursive_repr
    def __repr__(self):
        """Return a ``Field(...)`` style repr that also lists the extra attributes."""
        fields_array = [
            f"name={self.name!r}",
            f"type={self.type!r}",
            f"default={self.default!r}",
            f"default_factory={self.default_factory!r}",
            f"init={self.init!r}",
            f"repr={self.repr!r}",
            f"hash={self.hash!r}",
            f"compare={self.compare!r}",
            f"metadata={self.metadata!r}",
            f"sensitive={self.sensitive!r}",
            f"overload_name={self.overload_name!r}",
            f"_field_type={self._field_type}",
        ]
        # Support for kw_only added in 3.10, to support as low as 3.8, need to dynamically map.
        # Test for presence rather than truthiness so ``kw_only=False`` is still reported.
        if hasattr(self, "kw_only"):
            fields_array.append(f"kw_only={self.kw_only!r}")
        return "Field({})".format(",".join(fields_array))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def enhanced_field(
    *,
    default=MISSING,
    default_factory=MISSING,
    init: bool = True,
    repr: bool = True,
    hash=None,
    compare: bool = True,
    metadata=None,
    kw_only=MISSING,
    sensitive: bool = False,
    overload_name: t.Optional[str] = None,
):
    """Build an ``EnhancedField``, analogous to ``dataclasses.field``.

    Args:
        default, default_factory, init, repr, hash, compare, metadata, kw_only:
            Passed positionally to ``Field.__init__``. ``kw_only`` is only
            passed when the running Python's ``Field`` declares that slot.
        sensitive: Stored on the returned field as ``sensitive``.
        overload_name: Stored on the returned field as ``overload_name``.

    Raises:
        ValueError: if both ``default`` and ``default_factory`` are given.
    """
    if default is not MISSING and default_factory is not MISSING:
        raise ValueError("cannot specify both default and default_factory")
    args = [default, default_factory, init, repr, hash, compare, metadata]
    # ``Field.__init__`` takes all arguments positionally and its arity differs
    # between Python versions, so extend the list based on the declared slots.
    field_slots = EnhancedField.__slots__
    # Support for kw_only added in 3.10, to support as low as 3.8, need to dynamically map
    if "kw_only" in field_slots:
        args.append(kw_only)
    # Python 3.14 added a trailing ``doc`` argument; pass its default of ``None``.
    if "doc" in field_slots:
        args.append(None)
    return EnhancedField(*args, sensitive=sensitive, overload_name=overload_name)
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import InitVar, fields
|
|
5
|
+
from typing import Any, Callable, Optional, Type, TypeVar, Union
|
|
6
|
+
|
|
7
|
+
import dataclasses_json.core as dataclasses_json_core
|
|
8
|
+
from dataclasses_json import DataClassJsonMixin
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
11
|
+
|
|
12
|
+
A = TypeVar("A", bound="EnhancedDataClassJsonMixin")
|
|
13
|
+
|
|
14
|
+
# Monkey-patch _decode_dataclass to support name override
og_decode_dataclass = dataclasses_json_core._decode_dataclass


def custom_decode_dataclass(cls, kvs, infer_missing):
    """Map overload names back to field names, then run the original decoder.

    For every field of ``cls`` that has a non-empty ``overload_name``
    attribute, a key equal to that overload name in ``kvs`` is renamed to the
    field name. The renaming is done on a shallow copy so the caller's dict is
    left untouched. Non-dict ``kvs`` values are passed through unchanged.

    Args:
        cls: Dataclass type being decoded.
        kvs: Input data handed to the decoder.
        infer_missing: Forwarded to the original ``_decode_dataclass``.
    """
    renames = [
        (f.overload_name, f.name) for f in fields(cls) if getattr(f, "overload_name", None)
    ]
    if isinstance(kvs, dict) and any(alias in kvs for alias, _ in renames):
        # Copy before renaming: ``pop`` on the original would mutate the
        # dictionary owned by the caller.
        kvs = dict(kvs)
        for alias, field_name in renames:
            if alias in kvs:
                kvs[field_name] = kvs.pop(alias)
    return og_decode_dataclass(cls, kvs, infer_missing)


dataclasses_json_core._decode_dataclass = custom_decode_dataclass
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class EnhancedDataClassJsonMixin(DataClassJsonMixin):
    """Extension of ``DataClassJsonMixin`` with redaction and name-overload options.

    ``to_dict`` and ``to_json`` delegate to ``_asdict`` and expose its
    ``redact_sensitive``, ``redacted_text`` and ``apply_name_overload``
    options. ``from_dict`` delegates to ``_decode_dataclass`` from
    ``dataclasses_json.core``. All three first reject classes that declare
    ``InitVar`` annotations.
    """

    @classmethod
    def check_init_var(cls):
        """Raise ``TypeError`` if the class itself annotates any field with an ``InitVar``.

        Only annotations found in ``cls.__dict__`` are inspected, and only
        those whose value is an ``InitVar`` instance.
        """
        annotations = cls.__dict__.get("__annotations__", {})
        offending = [name for name, hint in annotations.items() if isinstance(hint, InitVar)]
        if not offending:
            return
        raise TypeError(
            "Class {} has the following fields defined with an InitVar which "
            "cannot be used with EnhancedDataClassJsonMixin: {}".format(
                cls.__name__, ", ".join(offending)
            )
        )

    def to_json(
        self,
        *,
        skipkeys: bool = False,
        ensure_ascii: bool = True,
        check_circular: bool = True,
        allow_nan: bool = True,
        indent: Optional[Union[int, str]] = None,
        separators: Optional[tuple[str, str]] = None,
        default: Optional[Callable[..., Any]] = None,
        sort_keys: bool = False,
        redact_sensitive: bool = False,
        redacted_text: str = "***REDACTED***",
        apply_name_overload: bool = True,
        **kw: Any,
    ) -> str:
        """Serialize the object to a JSON string.

        The object is first converted with ``to_dict`` (``encode_json=False``)
        using the redaction and name-overload options; the remaining arguments
        are handed to ``json.dumps`` together with the ``_ExtendedEncoder``
        from ``dataclasses_json.core``.
        """
        self.check_init_var()
        payload = self.to_dict(
            encode_json=False,
            redact_sensitive=redact_sensitive,
            redacted_text=redacted_text,
            apply_name_overload=apply_name_overload,
        )
        return json.dumps(
            payload,
            cls=dataclasses_json_core._ExtendedEncoder,
            skipkeys=skipkeys,
            ensure_ascii=ensure_ascii,
            check_circular=check_circular,
            allow_nan=allow_nan,
            indent=indent,
            separators=separators,
            default=default,
            sort_keys=sort_keys,
            **kw,
        )

    @classmethod
    def from_dict(
        cls: Type[A],
        kvs: dataclasses_json_core.Json,
        *,
        infer_missing=False,
        apply_name_overload=False,
    ) -> A:
        """Build an instance of ``cls`` from ``kvs``.

        NOTE(review): ``apply_name_overload`` is accepted but not read in this
        method; confirm whether that is intended.
        """
        cls.check_init_var()
        decode = dataclasses_json_core._decode_dataclass
        return decode(cls, kvs, infer_missing)

    def to_dict(
        self,
        encode_json: bool = False,
        redact_sensitive: bool = False,
        redacted_text: str = "***REDACTED***",
        apply_name_overload: bool = True,
    ) -> dict[str, dataclasses_json_core.Json]:
        """Convert the object to a dict by passing all options to ``_asdict``."""
        self.check_init_var()
        options = {
            "encode_json": encode_json,
            "redact_sensitive": redact_sensitive,
            "redacted_text": redacted_text,
            "apply_name_overload": apply_name_overload,
        }
        return _asdict(self, **options)
|