unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
5
|
+
|
|
6
|
+
from unstructured import __name__ as integration_name
|
|
7
|
+
from unstructured.__version__ import __version__ as integration_version
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
10
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
11
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
12
|
+
from unstructured_ingest.v2.interfaces import (
|
|
13
|
+
AccessConfig,
|
|
14
|
+
ConnectionConfig,
|
|
15
|
+
FileData,
|
|
16
|
+
UploadContent,
|
|
17
|
+
Uploader,
|
|
18
|
+
UploaderConfig,
|
|
19
|
+
UploadStager,
|
|
20
|
+
UploadStagerConfig,
|
|
21
|
+
)
|
|
22
|
+
from unstructured_ingest.v2.logger import logger
|
|
23
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
24
|
+
DestinationRegistryEntry,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from astrapy.db import AstraDBCollection
|
|
29
|
+
|
|
30
|
+
# Identifier for this connector; used as the default ConnectionConfig.connection_type.
CONNECTOR_TYPE = "astra"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class AstraAccessConfig(AccessConfig):
    """Credentials for an Astra DB instance (marked sensitive via the connection config)."""

    token: str  # Astra DB application token, passed to AstraDB(token=...)
    api_endpoint: str  # Astra DB API endpoint URL, passed to AstraDB(api_endpoint=...)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class AstraConnectionConfig(ConnectionConfig):
    """Connection settings for the Astra destination connector."""

    connection_type: str = CONNECTOR_TYPE  # "astra"
    # sensitive=True flags the credentials for special handling by the
    # enhanced-dataclass machinery (presumably redaction on serialization —
    # see enhanced_field for the exact semantics).
    access_config: AstraAccessConfig = enhanced_field(sensitive=True)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class AstraUploadStagerConfig(UploadStagerConfig):
    """No stager-specific options; present so the registry entry has a config type."""

    pass
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class AstraUploadStager(UploadStager):
    """Stages partitioned-element JSON for upload to Astra DB.

    Each element dict is reshaped into the Astra document schema:
    ``$vector`` (embedding), ``content`` (text), and ``metadata`` (everything
    else).
    """

    upload_stager_config: AstraUploadStagerConfig = field(
        default_factory=lambda: AstraUploadStagerConfig()
    )

    def conform_dict(self, element_dict: dict) -> dict:
        """Map one element dict onto the Astra document schema.

        Note: ``pop`` mutates the incoming dict, so the ``metadata`` payload is
        the original dict minus the ``embeddings`` and ``text`` keys. Missing
        keys yield ``None`` for ``$vector``/``content``.
        """
        return {
            "$vector": element_dict.pop("embeddings", None),
            "content": element_dict.pop("text", None),
            "metadata": element_dict,
        }

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Read the elements JSON file, conform every element, and write the
        staged JSON to ``output_dir/output_filename.json``.

        Returns the path of the staged file.
        """
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)
        # Comprehension instead of the original manual append loop (same order,
        # same results).
        conformed_elements = [
            self.conform_dict(element_dict=element) for element in elements_contents
        ]
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        with open(output_path, "w") as output_file:
            json.dump(conformed_elements, output_file)
        return output_path
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass
class AstraUploaderConfig(UploaderConfig):
    """Options controlling how staged documents are written to Astra DB."""

    collection_name: str  # target collection, created/connected via create_collection
    embedding_dimension: int  # vector dimension passed when creating the collection
    namespace: Optional[str] = None  # Astra namespace; passed through to AstraDB(namespace=...)
    # When set, forwarded as the {"indexing": ...} collection option.
    requested_indexing_policy: Optional[dict[str, Any]] = None
    batch_size: int = 20  # documents per insert_many call
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@dataclass
class AstraUploader(Uploader):
    """Writes staged element documents into an Astra DB collection in batches."""

    connection_config: AstraConnectionConfig
    upload_config: AstraUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["astrapy"], extras="astra")
    def get_collection(self) -> "AstraDBCollection":
        """Connect to Astra DB and return the configured collection."""
        from astrapy.db import AstraDB

        config = self.upload_config
        # Only pass collection options when an indexing policy was requested.
        options = (
            {"indexing": config.requested_indexing_policy}
            if config.requested_indexing_policy
            else None
        )

        # caller_name/version identify this integration to AstraDB tracking.
        client = AstraDB(
            api_endpoint=self.connection_config.access_config.api_endpoint,
            token=self.connection_config.access_config.token,
            namespace=config.namespace,
            caller_name=integration_name,
            caller_version=integration_version,
        )

        # NOTE(review): assumes create_collection also connects to an already
        # existing collection rather than failing — confirm against astrapy docs.
        return client.create_collection(
            collection_name=config.collection_name,
            dimension=config.embedding_dimension,
            options=options,
        )

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Load every staged JSON file and insert all documents batch by batch."""
        docs: list = []
        for item in contents:
            with open(item.path) as staged_file:
                docs.extend(json.load(staged_file))

        logger.info(
            f"writing {len(docs)} objects to destination "
            f"collection {self.upload_config.collection_name}"
        )

        batch_size = self.upload_config.batch_size
        collection = self.get_collection()

        for batch in batch_generator(docs, batch_size):
            collection.insert_many(batch)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# Registry entry bundling the Astra connector's destination pieces
# (connection config, stager, uploader) for the v2 connector registry.
astra_destination_entry = DestinationRegistryEntry(
    connection_config=AstraConnectionConfig,
    upload_stager_config=AstraUploadStagerConfig,
    upload_stager=AstraUploadStager,
    uploader_config=AstraUploaderConfig,
    uploader=AstraUploader,
)
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import typing as t
|
|
3
|
+
import uuid
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
8
|
+
from unstructured_ingest.error import DestinationConnectionError, WriteError
|
|
9
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
10
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
|
+
from unstructured_ingest.v2.interfaces import (
|
|
12
|
+
AccessConfig,
|
|
13
|
+
ConnectionConfig,
|
|
14
|
+
UploadContent,
|
|
15
|
+
Uploader,
|
|
16
|
+
UploaderConfig,
|
|
17
|
+
UploadStager,
|
|
18
|
+
UploadStagerConfig,
|
|
19
|
+
)
|
|
20
|
+
from unstructured_ingest.v2.logger import logger
|
|
21
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
22
|
+
DestinationRegistryEntry,
|
|
23
|
+
add_destination_entry,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
|
|
26
|
+
|
|
27
|
+
if t.TYPE_CHECKING:
|
|
28
|
+
from azure.search.documents import SearchClient
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Identifier for this connector within the v2 connector registry.
CONNECTOR_TYPE = "azure_cognitive_search"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class AzureCognitiveSearchAccessConfig(AccessConfig):
    """Credentials for Azure Cognitive Search (marked sensitive via the connection config)."""

    # API key used to build the AzureKeyCredential; overload_name presumably
    # controls the externally exposed option name — see enhanced_field.
    key: t.Optional[str] = enhanced_field(default=None, overload_name="azure_cognitive_search_key")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
    """Endpoint, index name, and credentials for an Azure Cognitive Search service."""

    endpoint: str
    index: str
    access_config: AzureCognitiveSearchAccessConfig = enhanced_field(sensitive=True)

    @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
    def generate_client(self) -> "SearchClient":
        """Build a SearchClient bound to the configured endpoint and index."""
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents import SearchClient

        credential = AzureKeyCredential(self.access_config.key)
        return SearchClient(
            endpoint=self.endpoint,
            index_name=self.index,
            credential=credential,
        )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
    """No stager-specific options; present so the stager has a config type."""

    pass
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class AzureCognitiveSearchUploaderConfig(UploaderConfig):
    """Options for writing documents to the Azure Cognitive Search index."""

    # Batch size for writes; usage is outside this view — presumably chunks
    # the upload_documents calls. TODO confirm in the uploader's write path.
    batch_size: int = 100
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
|
|
68
|
+
class AzureCognitiveSearchUploadStager(UploadStager):
|
|
69
|
+
upload_stager_config: AzureCognitiveSearchUploadStagerConfig = field(
|
|
70
|
+
default_factory=lambda: AzureCognitiveSearchUploadStagerConfig()
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
@staticmethod
|
|
74
|
+
def conform_dict(data: dict) -> dict:
|
|
75
|
+
"""
|
|
76
|
+
updates the dictionary that is from each Element being converted into a dict/json
|
|
77
|
+
into a dictionary that conforms to the schema expected by the
|
|
78
|
+
Azure Cognitive Search index
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
data["id"] = str(uuid.uuid4())
|
|
82
|
+
|
|
83
|
+
if points := data.get("metadata", {}).get("coordinates", {}).get("points"):
|
|
84
|
+
data["metadata"]["coordinates"]["points"] = json.dumps(points)
|
|
85
|
+
if version := data.get("metadata", {}).get("data_source", {}).get("version"):
|
|
86
|
+
data["metadata"]["data_source"]["version"] = str(version)
|
|
87
|
+
if record_locator := data.get("metadata", {}).get("data_source", {}).get("record_locator"):
|
|
88
|
+
data["metadata"]["data_source"]["record_locator"] = json.dumps(record_locator)
|
|
89
|
+
if permissions_data := (
|
|
90
|
+
data.get("metadata", {}).get("data_source", {}).get("permissions_data")
|
|
91
|
+
):
|
|
92
|
+
data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data)
|
|
93
|
+
if links := data.get("metadata", {}).get("links"):
|
|
94
|
+
data["metadata"]["links"] = [json.dumps(link) for link in links]
|
|
95
|
+
if last_modified := data.get("metadata", {}).get("last_modified"):
|
|
96
|
+
data["metadata"]["last_modified"] = parse_datetime(last_modified).strftime(
|
|
97
|
+
"%Y-%m-%dT%H:%M:%S.%fZ"
|
|
98
|
+
)
|
|
99
|
+
if date_created := data.get("metadata", {}).get("data_source", {}).get("date_created"):
|
|
100
|
+
data["metadata"]["data_source"]["date_created"] = parse_datetime(date_created).strftime(
|
|
101
|
+
"%Y-%m-%dT%H:%M:%S.%fZ"
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
if date_modified := data.get("metadata", {}).get("data_source", {}).get("date_modified"):
|
|
105
|
+
data["metadata"]["data_source"]["date_modified"] = parse_datetime(
|
|
106
|
+
date_modified
|
|
107
|
+
).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
|
108
|
+
|
|
109
|
+
if date_processed := data.get("metadata", {}).get("data_source", {}).get("date_processed"):
|
|
110
|
+
data["metadata"]["data_source"]["date_processed"] = parse_datetime(
|
|
111
|
+
date_processed
|
|
112
|
+
).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
|
113
|
+
|
|
114
|
+
if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
|
|
115
|
+
data["metadata"]["regex_metadata"] = json.dumps(regex_metadata)
|
|
116
|
+
if page_number := data.get("metadata", {}).get("page_number"):
|
|
117
|
+
data["metadata"]["page_number"] = str(page_number)
|
|
118
|
+
return data
|
|
119
|
+
|
|
120
|
+
def run(
    self,
    elements_filepath: Path,
    output_dir: Path,
    output_filename: str,
    **kwargs: t.Any,
) -> Path:
    """Stage elements for upload: read the raw element JSON, conform each
    element via ``self.conform_dict``, and write the result as
    ``<output_dir>/<output_filename>.json``.

    Returns the path of the file that was written.
    """
    with open(elements_filepath) as handle:
        raw_elements = handle.read()
    raw_elements = json.loads(raw_elements)

    staged = []
    for element in raw_elements:
        staged.append(self.conform_dict(data=element))

    destination = Path(output_dir) / Path(f"{output_filename}.json")
    with open(destination, "w") as handle:
        json.dump(staged, handle)
    return destination
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@dataclass
class AzureCognitiveSearchUploader(Uploader):
    """Pushes staged element documents into an Azure Cognitive Search index,
    batching uploads according to the upload config."""

    upload_config: AzureCognitiveSearchUploaderConfig
    connection_config: AzureCognitiveSearchConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @DestinationConnectionError.wrap
    @requires_dependencies(["azure"], extras="azure-cognitive-search")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upload one batch of documents; raise WriteError if the request
        fails or any individual document is rejected."""
        import azure.core.exceptions

        logger.info(
            f"writing {len(elements_dict)} documents to destination "
            f"index at {self.connection_config.index}",
        )
        try:
            results = self.connection_config.generate_client().upload_documents(
                documents=elements_dict
            )
        except azure.core.exceptions.HttpResponseError as http_error:
            raise WriteError(f"http error: {http_error}") from http_error

        # Partition per-document results so failures can be reported together.
        failed = [r for r in results if not r.succeeded]
        succeeded = [r for r in results if r.succeeded]
        logger.debug(f"results: {len(succeeded)} successes, {len(failed)} failures")
        if failed:
            raise WriteError(
                ", ".join(
                    f"{error.key}: [{error.status_code}] {error.error_message}"
                    for error in failed
                ),
            )

    def write_dict_wrapper(self, elements_dict):
        """Thin positional wrapper around write_dict's keyword-only argument."""
        return self.write_dict(elements_dict=elements_dict)

    def run(self, contents: list[UploadContent], **kwargs: t.Any) -> None:
        """Load every staged JSON file and upload its elements in batches."""
        all_elements: list = []
        for content in contents:
            with open(content.path) as handle:
                all_elements.extend(json.load(handle))

        logger.info(
            f"writing document batches to destination"
            f" endpoint at {str(self.connection_config.endpoint)}"
            f" index at {str(self.connection_config.index)}"
            f" with batch size {str(self.upload_config.batch_size)}"
        )

        for batch in batch_generator(all_elements, self.upload_config.batch_size):
            self.write_dict(elements_dict=batch)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
# Register this connector under CONNECTOR_TYPE in the global destination
# registry so the ingest pipeline can resolve it to these component classes.
add_destination_entry(
    destination_type=CONNECTOR_TYPE,
    entry=DestinationRegistryEntry(
        connection_config=AzureCognitiveSearchConnectionConfig,
        uploader=AzureCognitiveSearchUploader,
        uploader_config=AzureCognitiveSearchUploaderConfig,
        upload_stager=AzureCognitiveSearchUploadStager,
        upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
    ),
)
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import uuid
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import date, datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional
|
|
7
|
+
|
|
8
|
+
from dateutil import parser
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
11
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
12
|
+
from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
|
|
13
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
+
from unstructured_ingest.v2.interfaces import (
|
|
15
|
+
AccessConfig,
|
|
16
|
+
ConnectionConfig,
|
|
17
|
+
FileData,
|
|
18
|
+
UploadContent,
|
|
19
|
+
Uploader,
|
|
20
|
+
UploaderConfig,
|
|
21
|
+
UploadStager,
|
|
22
|
+
UploadStagerConfig,
|
|
23
|
+
)
|
|
24
|
+
from unstructured_ingest.v2.logger import logger
|
|
25
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
26
|
+
DestinationRegistryEntry,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
# chromadb is an optional extra; import the client type only for static typing.
if TYPE_CHECKING:
    from chromadb import Client

# Identifier for this destination connector (used in configs and the registry).
CONNECTOR_TYPE = "chroma"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class ChromaAccessConfig(AccessConfig):
    """Sensitive client options forwarded verbatim to the chromadb client."""

    # chromadb Settings values; passed to both PersistentClient and HttpClient.
    settings: Optional[Dict[str, str]] = None
    # Extra HTTP headers; passed to HttpClient connections only.
    headers: Optional[Dict[str, str]] = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class ChromaConnectionConfig(ConnectionConfig):
    """Connection settings for a Chroma collection.

    Either ``path`` (persistent on-disk client) or both ``host`` and ``port``
    (HTTP client) must be set; see ``ChromaUploader.create_client``.
    """

    collection_name: str
    # Marked sensitive so credentials are redacted when configs are logged.
    access_config: ChromaAccessConfig = enhanced_field(sensitive=True)
    path: Optional[str] = None
    tenant: Optional[str] = "default_tenant"
    database: Optional[str] = "default_database"
    host: Optional[str] = None
    port: Optional[int] = None
    ssl: bool = False
    connector_type: str = CONNECTOR_TYPE
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class ChromaUploadStagerConfig(UploadStagerConfig):
    # No Chroma-specific staging options yet; placeholder for future settings.
    pass
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class ChromaUploadStager(UploadStager):
    """Transforms raw element dicts into the record shape Chroma expects and
    writes the conformed records back out as JSON."""

    upload_stager_config: ChromaUploadStagerConfig = field(
        default_factory=lambda: ChromaUploadStagerConfig()
    )

    @staticmethod
    def parse_date_string(date_string: str) -> datetime:
        """Parse *date_string* as a unix timestamp, falling back to dateutil's
        flexible parser for non-numeric strings.

        Note: the return annotation was corrected from ``date`` to
        ``datetime`` — both branches produce a ``datetime``.
        """
        try:
            timestamp = float(date_string)
            return datetime.fromtimestamp(timestamp)
        except Exception as e:
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    @staticmethod
    def conform_dict(data: dict) -> dict:
        """
        Prepares dictionary in the format that Chroma requires
        """
        element_id = data.get("element_id", str(uuid.uuid4()))
        return {
            "id": element_id,
            # pop() removes these keys so they are not duplicated in metadata.
            "embedding": data.pop("embeddings", None),
            "document": data.pop("text", None),
            "metadata": flatten_dict(data, separator="-", flatten_lists=True, remove_none=True),
        }

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Read staged elements, conform each one, and write them to
        ``<output_dir>/<output_filename>.json``; returns the written path."""
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)
        conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        with open(output_path, "w") as output_file:
            json.dump(conformed_elements, output_file)
        return output_path
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass
class ChromaUploaderConfig(UploaderConfig):
    # Number of records upserted into the collection per request.
    batch_size: int = 100
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@dataclass
class ChromaUploader(Uploader):
    """Upserts conformed element records into a Chroma collection in batches."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: ChromaUploaderConfig
    connection_config: ChromaConnectionConfig
    # Created eagerly in __post_init__, not supplied by callers.
    client: Optional["Client"] = field(init=False)

    def __post_init__(self):
        self.client = self.create_client()

    @requires_dependencies(["chromadb"], extras="chroma")
    def create_client(self) -> "Client":
        """Build a persistent (on-disk) or HTTP Chroma client from the config.

        Raises:
            ValueError: if neither ``path`` nor a ``host``/``port`` pair is set.
        """
        import chromadb

        if self.connection_config.path:
            return chromadb.PersistentClient(
                path=self.connection_config.path,
                settings=self.connection_config.access_config.settings,
                tenant=self.connection_config.tenant,
                database=self.connection_config.database,
            )

        elif self.connection_config.host and self.connection_config.port:
            return chromadb.HttpClient(
                host=self.connection_config.host,
                port=self.connection_config.port,
                ssl=self.connection_config.ssl,
                headers=self.connection_config.access_config.headers,
                settings=self.connection_config.access_config.settings,
                tenant=self.connection_config.tenant,
                database=self.connection_config.database,
            )
        else:
            raise ValueError("Chroma connector requires either path or host and port to be set.")

    @DestinationConnectionError.wrap
    def upsert_batch(self, collection, batch):
        """Upsert one prepared batch of parallel lists into *collection*."""
        try:
            # Chroma wants lists even if there is only one element
            # Upserting to prevent duplicates
            collection.upsert(
                ids=batch["ids"],
                documents=batch["documents"],
                embeddings=batch["embeddings"],
                metadatas=batch["metadatas"],
            )
        except Exception as e:
            raise ValueError(f"chroma error: {e}") from e

    @staticmethod
    def prepare_chroma_list(chunk: tuple[dict[str, Any]]) -> dict[str, list[Any]]:
        """Helper function to break a tuple of dicts into list of parallel lists for ChromaDb.
        ({'id':1}, {'id':2}, {'id':3}) -> {'ids':[1,2,3]}"""
        chroma_dict = {
            "ids": [x.get("id") for x in chunk],
            "documents": [x.get("document") for x in chunk],
            "embeddings": [x.get("embedding") for x in chunk],
            "metadatas": [x.get("metadata") for x in chunk],
        }
        # Make sure all lists are of the same length. An explicit check is used
        # instead of `assert`, which is silently stripped under `python -O`.
        if len({len(values) for values in chroma_dict.values()}) != 1:
            raise ValueError("prepared Chroma lists must all be the same length")
        return chroma_dict

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Load every staged file and upsert its records batch by batch."""
        elements_dict = []
        for content in contents:
            with open(content.path) as elements_file:
                elements = json.load(elements_file)
            elements_dict.extend(elements)

        # NOTE(review): host is None for path-based (persistent) clients, so
        # this log line may print "at None" in that mode.
        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"collection {self.connection_config.collection_name} "
            f"at {self.connection_config.host}",
        )

        collection = self.client.get_or_create_collection(
            name=self.connection_config.collection_name
        )
        for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
            self.upsert_batch(collection, self.prepare_chroma_list(chunk))
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
# Registry entry bundling the Chroma connector components; registered with the
# connector registry elsewhere under CONNECTOR_TYPE.
chroma_destination_entry = DestinationRegistryEntry(
    connection_config=ChromaConnectionConfig,
    uploader=ChromaUploader,
    uploader_config=ChromaUploaderConfig,
    upload_stager=ChromaUploadStager,
    upload_stager_config=ChromaUploadStagerConfig,
)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
6
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
7
|
+
from unstructured_ingest.v2.interfaces import (
|
|
8
|
+
AccessConfig,
|
|
9
|
+
ConnectionConfig,
|
|
10
|
+
UploadContent,
|
|
11
|
+
Uploader,
|
|
12
|
+
UploaderConfig,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
15
|
+
|
|
16
|
+
# databricks-sdk is an optional extra; import only for static type checking.
if TYPE_CHECKING:
    from databricks.sdk import WorkspaceClient

# Identifier for this destination connector (used in configs and the registry).
CONNECTOR_TYPE = "databricks_volumes"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class DatabricksVolumesAccessConfig(AccessConfig):
    """Credential fields splatted into ``databricks.sdk.WorkspaceClient``
    (see DatabricksVolumesUploader.__post_init__). All fields are optional;
    which subset is required depends on the chosen auth mechanism."""

    # Basic / account-level auth.
    account_id: Optional[str] = None
    username: Optional[str] = None
    password: Optional[str] = None
    # OAuth client credentials.
    client_id: Optional[str] = None
    client_secret: Optional[str] = None
    # Personal access token or a named config profile.
    token: Optional[str] = None
    profile: Optional[str] = None
    # Azure-specific identity settings.
    azure_workspace_resource_id: Optional[str] = None
    azure_client_secret: Optional[str] = None
    azure_client_id: Optional[str] = None
    azure_tenant_id: Optional[str] = None
    azure_environment: Optional[str] = None
    auth_type: Optional[str] = None
    cluster_id: Optional[str] = None
    # GCP-specific credentials.
    google_credentials: Optional[str] = None
    google_service_account: Optional[str] = None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class DatabricksVolumesConnectionConfig(ConnectionConfig):
    """Workspace host plus credentials for the Databricks Volumes connector."""

    # Marked sensitive so credentials are redacted when configs are logged.
    access_config: DatabricksVolumesAccessConfig = enhanced_field(
        default_factory=DatabricksVolumesAccessConfig, sensitive=True
    )
    # Workspace URL; forwarded to WorkspaceClient(host=...).
    host: Optional[str] = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class DatabricksVolumesUploaderConfig(UploaderConfig):
    """Destination location within a Unity Catalog volume."""

    volume: str
    catalog: str
    volume_path: Optional[str] = None
    overwrite: bool = False
    schema: str = "default"

    @property
    def path(self) -> str:
        """Absolute ``/Volumes/<catalog>/<schema>/<volume>[/<volume_path>]``
        directory that uploads are written into."""
        segments = ["/Volumes", self.catalog, self.schema, self.volume]
        if self.volume_path:
            segments.append(self.volume_path)
        return "/".join(segments)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
class DatabricksVolumesUploader(Uploader):
    """Uploads result files into a Databricks Unity Catalog volume."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: DatabricksVolumesUploaderConfig
    connection_config: DatabricksVolumesConnectionConfig
    # Created eagerly in __post_init__, not supplied by callers.
    client: Optional["WorkspaceClient"] = field(init=False, default=None)

    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
    def __post_init__(self) -> None:
        # Annotation corrected: __post_init__ initializes self.client and
        # returns nothing; the original "-> WorkspaceClient" was inaccurate.
        from databricks.sdk import WorkspaceClient

        self.client = WorkspaceClient(
            host=self.connection_config.host, **self.connection_config.access_config.to_dict()
        )

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Upload each local file to ``<configured volume path>/<filename>``."""
        for content in contents:
            with open(content.path, "rb") as elements_file:
                output_path = os.path.join(self.upload_config.path, content.path.name)
                self.client.files.upload(
                    file_path=output_path,
                    contents=elements_file,
                    overwrite=self.upload_config.overwrite,
                )
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# Registry entry bundling the Databricks Volumes connector components;
# registered with the connector registry elsewhere under CONNECTOR_TYPE.
# No upload stager is defined: files are uploaded as-is.
databricks_volumes_destination_entry = DestinationRegistryEntry(
    connection_config=DatabricksVolumesConnectionConfig,
    uploader=DatabricksVolumesUploader,
    uploader_config=DatabricksVolumesUploaderConfig,
)
|