unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
import enum
|
|
2
|
+
import json
|
|
3
|
+
import uuid
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from datetime import date, datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Optional, Union
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from dateutil import parser
|
|
12
|
+
|
|
13
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
14
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
15
|
+
from unstructured_ingest.v2.interfaces import (
|
|
16
|
+
AccessConfig,
|
|
17
|
+
ConnectionConfig,
|
|
18
|
+
FileData,
|
|
19
|
+
UploadContent,
|
|
20
|
+
Uploader,
|
|
21
|
+
UploaderConfig,
|
|
22
|
+
UploadStager,
|
|
23
|
+
UploadStagerConfig,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.v2.logger import logger
|
|
26
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
27
|
+
|
|
28
|
+
# Default value of the `connector_type` field on the SQL config and uploader.
CONNECTOR_TYPE = "sql"
# Table named in the uploader's INSERT statement.
ELEMENTS_TABLE_NAME = "elements"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class SQLAccessConfig(AccessConfig):
    """Credentials for the SQL destination; both values default to ``None``."""

    # Passed as `user` when the PostgreSQL connection is created.
    username: Optional[str] = None
    # Passed as `password` when the PostgreSQL connection is created.
    password: Optional[str] = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class DatabaseType(str, enum.Enum):
    """Database backends the SQL destination knows how to connect to.

    Members are also ``str`` instances, so each compares equal to its raw value.
    """

    SQLITE = "sqlite"
    POSTGRESQL = "postgresql"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class SimpleSqlConfig(ConnectionConfig):
    """Connection settings for the SQL destination (SQLite or PostgreSQL)."""

    # A default value is required here because of the parent class.
    db_type: DatabaseType = DatabaseType.SQLITE
    # SQLite: path to the *.db file. PostgreSQL: passed as `dbname`.
    database: Optional[str] = None
    host: Optional[str] = None
    port: Optional[int] = 5432
    access_config: Optional[SQLAccessConfig] = enhanced_field(default=None, sensitive=True)
    connector_type: str = CONNECTOR_TYPE

    def __post_init__(self):
        """Reject a SQLite configuration that does not name a database.

        Raises:
            ValueError: if ``db_type`` is SQLite and ``database`` is ``None``.
        """
        targets_sqlite = self.db_type == DatabaseType.SQLITE
        if not targets_sqlite or self.database is not None:
            return
        raise ValueError(
            "A sqlite connection requires a path to a *.db file "
            "through the `database` argument"
        )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class SQLUploadStagerConfig(UploadStagerConfig):
    """Stager settings for the SQL destination; declares no fields of its own."""
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
_COLUMNS = (
|
|
69
|
+
"id",
|
|
70
|
+
"element_id",
|
|
71
|
+
"text",
|
|
72
|
+
"embeddings",
|
|
73
|
+
"type",
|
|
74
|
+
"system",
|
|
75
|
+
"layout_width",
|
|
76
|
+
"layout_height",
|
|
77
|
+
"points",
|
|
78
|
+
"url",
|
|
79
|
+
"version",
|
|
80
|
+
"date_created",
|
|
81
|
+
"date_modified",
|
|
82
|
+
"date_processed",
|
|
83
|
+
"permissions_data",
|
|
84
|
+
"record_locator",
|
|
85
|
+
"category_depth",
|
|
86
|
+
"parent_id",
|
|
87
|
+
"attached_filename",
|
|
88
|
+
"filetype",
|
|
89
|
+
"last_modified",
|
|
90
|
+
"file_directory",
|
|
91
|
+
"filename",
|
|
92
|
+
"languages",
|
|
93
|
+
"page_number",
|
|
94
|
+
"links",
|
|
95
|
+
"page_name",
|
|
96
|
+
"link_urls",
|
|
97
|
+
"link_texts",
|
|
98
|
+
"sent_from",
|
|
99
|
+
"sent_to",
|
|
100
|
+
"subject",
|
|
101
|
+
"section",
|
|
102
|
+
"header_footer_type",
|
|
103
|
+
"emphasized_text_contents",
|
|
104
|
+
"emphasized_text_tags",
|
|
105
|
+
"text_as_html",
|
|
106
|
+
"regex_metadata",
|
|
107
|
+
"detection_class_prob",
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
_DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def parse_date_string(date_value: Union[str, int, float, datetime]) -> datetime:
    """Convert a timestamp-like value or a date string into a ``datetime``.

    Interpretation rules:

    * ``datetime`` instances are returned unchanged.
    * ``int`` values are treated as epoch milliseconds.
    * Anything else that ``float()`` accepts is treated as epoch seconds.
    * Everything else is handed to ``dateutil``'s parser.

    Args:
        date_value: The raw value to convert.

    Returns:
        The corresponding ``datetime`` (naive local time for timestamps, as
        produced by ``datetime.fromtimestamp``).

    Raises:
        Whatever ``dateutil.parser.parse`` raises when the value is neither a
        usable timestamp nor a parseable date string.
    """
    # Already a datetime: nothing to convert. Previously this fell through to
    # the string parser, which rejects non-string input.
    if isinstance(date_value, datetime):
        return date_value

    try:
        seconds = float(date_value)
        if isinstance(date_value, int):
            # Integer inputs carry milliseconds.
            seconds /= 1000
        return datetime.fromtimestamp(seconds)
    except (TypeError, ValueError, OverflowError, OSError) as e:
        # Only the errors float()/fromtimestamp() raise are treated as
        # "not a timestamp"; anything else should surface.
        logger.debug(f"date {date_value} string not a timestamp: {e}")
    return parser.parse(date_value)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@dataclass
class SQLUploadStager(UploadStager):
    """Flatten an elements JSON file into line-delimited rows for SQL upload."""

    upload_stager_config: SQLUploadStagerConfig = field(
        default_factory=lambda: SQLUploadStagerConfig()
    )

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Write a flattened, SQL-ready copy of ``elements_filepath``.

        Each element's ``metadata`` dict, and the ``data_source`` and
        ``coordinates`` dicts nested in it, are merged into the element
        itself. A fresh UUID is stored under ``id`` and keys not listed in
        ``_COLUMNS`` are discarded. Date columns are parsed, selected
        structured columns are JSON-encoded, and a few columns are cast to
        strings before the rows are written as JSON lines.

        Args:
            elements_filepath: JSON file holding a list of element dicts.
            file_data: Not used by this stager.
            output_dir: Directory that receives the staged file.
            output_filename: Stem of the staged file; ``.json`` is appended.
            **kwargs: Ignored.

        Returns:
            Path of the staged file.
        """
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        output = []
        for data in elements_contents:
            metadata: dict[str, Any] = data.pop("metadata", {})
            data_source = metadata.pop("data_source", {})
            coordinates = metadata.pop("coordinates", {})

            # Later updates win on key collisions: metadata, then data_source,
            # then coordinates.
            data.update(metadata)
            data.update(data_source)
            data.update(coordinates)

            data["id"] = str(uuid.uuid4())

            # Keep only supported columns. Popping keys from `data` while
            # iterating over it raises "dictionary changed size during
            # iteration", so build a filtered copy instead.
            output.append({key: value for key, value in data.items() if key in _COLUMNS})

        df = pd.DataFrame.from_dict(output)
        for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
            df[column] = df[column].apply(parse_date_string)
        for column in filter(
            lambda x: x in df.columns,
            ("permissions_data", "record_locator", "points", "links"),
        ):
            # Values that are not a list or dict are replaced by None.
            df[column] = df[column].apply(
                lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
            )
        for column in filter(
            lambda x: x in df.columns,
            ("version", "page_number", "regex_metadata"),
        ):
            df[column] = df[column].apply(str)

        with output_path.open("w") as output_file:
            df.to_json(output_file, orient="records", lines=True)
        return output_path
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
@dataclass
class SQLUploaderConfig(UploaderConfig):
    """Uploader settings for the SQL destination."""

    # Number of rows inserted per batch during upload.
    batch_size: int = 50
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
@dataclass
class SQLUploader(Uploader):
    """Insert staged element rows into the elements table of a SQL database.

    SQLite and PostgreSQL are handled; the backend is chosen by
    ``connection_config.db_type``.
    """

    connector_type: str = CONNECTOR_TYPE
    upload_config: SQLUploaderConfig
    connection_config: SimpleSqlConfig

    @property
    def connection(self):
        """Return the connection factory for the configured backend.

        The property yields the factory method itself, not an open
        connection; callers invoke the result to connect.

        Raises:
            ValueError: if ``db_type`` is neither PostgreSQL nor SQLite.
        """
        if self.connection_config.db_type == DatabaseType.POSTGRESQL:
            return self._make_psycopg_connection
        elif self.connection_config.db_type == DatabaseType.SQLITE:
            return self._make_sqlite_connection
        raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")

    def _make_sqlite_connection(self):
        """Open a ``sqlite3`` connection to ``connection_config.database``."""
        from sqlite3 import connect

        return connect(database=self.connection_config.database)

    @requires_dependencies(["psycopg2"], extras="postgres")
    def _make_psycopg_connection(self):
        """Open a ``psycopg2`` connection from the connection config.

        ``connection_config.access_config`` is dereferenced unconditionally,
        so it must be set for PostgreSQL.
        """
        from psycopg2 import connect

        return connect(
            user=self.connection_config.access_config.username,
            password=self.connection_config.access_config.password,
            dbname=self.connection_config.database,
            host=self.connection_config.host,
            port=self.connection_config.port,
        )

    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        """Convert raw rows into values suitable for ``executemany``.

        For SQLite, list and dict values are JSON-encoded. Values in date
        columns are parsed with ``parse_date_string`` unless they are
        ``None``. All other values pass through unchanged.

        Args:
            columns: Column names, positionally aligned with each row.
            data: Rows as tuples of raw values.

        Returns:
            One tuple of converted values per input row.
        """
        is_sqlite = self.connection_config.db_type == DatabaseType.SQLITE
        output = []
        for row in data:
            parsed = []
            for column_name, value in zip(columns, row):
                if is_sqlite and isinstance(value, (list, dict)):
                    value = json.dumps(value)
                if column_name in _DATE_COLUMNS and value is not None:
                    value = parse_date_string(value)
                parsed.append(value)
            output.append(tuple(parsed))
        return output

    def upload_contents(self, content: UploadContent) -> None:
        """Insert every row of one staged JSON-lines file, batch by batch.

        Rows are inserted ``upload_config.batch_size`` at a time and each
        batch is committed separately. A single connection serves the whole
        file and is closed when the upload finishes or fails.

        Args:
            content: Upload item whose ``path`` points at the staged file.

        Raises:
            ValueError: if ``upload_config.batch_size`` is smaller than 1.
        """
        batch_size = self.upload_config.batch_size
        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")

        df = pd.read_json(content.path, orient="records", lines=True)
        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
        if df.empty:
            return
        # Missing values must reach the driver as None. Batches are sliced
        # from this normalised frame; re-reading the file would bypass it.
        df.replace({np.nan: None}, inplace=True)

        columns = tuple(df.columns)
        is_sqlite = self.connection_config.db_type == DatabaseType.SQLITE
        # sqlite3 uses "?" placeholders, psycopg2 uses "%s".
        placeholders = ",".join("?" if is_sqlite else "%s" for _ in columns)
        stmt = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(columns)}) VALUES({placeholders})"

        conn = self.connection()
        try:
            for start in range(0, len(df), batch_size):
                rows = df.iloc[start : start + batch_size]
                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
                # The connection's context manager scopes one transaction per
                # batch; it does not close the connection.
                with conn:
                    if is_sqlite:
                        conn.executemany(stmt, values)
                    else:
                        with conn.cursor() as cur:
                            cur.executemany(stmt, values)

                    conn.commit()
        finally:
            conn.close()

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Upload each staged file in ``contents`` in order."""
        for content in contents:
            self.upload_contents(content=content)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# Bundles the SQL connector's config, stager and uploader classes into one
# destination registry entry.
sql_destination_entry = DestinationRegistryEntry(
    upload_stager_config=SQLUploadStagerConfig,
    upload_stager=SQLUploadStager,
    uploader_config=SQLUploaderConfig,
    uploader=SQLUploader,
    connection_config=SimpleSqlConfig,
)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from typing import Union
|
|
3
|
+
|
|
4
|
+
from dateutil import parser
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def parse_datetime(date_value: Union[int, str, float, datetime]) -> datetime:
    """Coerce a timestamp or date string into a ``datetime``.

    ``datetime`` inputs are returned as-is, floats are read as epoch seconds
    and ints as epoch milliseconds. Any other value is first tried as a
    numeric string of epoch seconds; if ``ValueError`` is raised it is given
    to ``dateutil``'s parser instead.
    """
    if isinstance(date_value, datetime):
        return date_value
    if isinstance(date_value, float):
        return datetime.fromtimestamp(date_value)
    if isinstance(date_value, int):
        # Integer inputs carry milliseconds.
        return datetime.fromtimestamp(date_value / 1000)

    try:
        return datetime.fromtimestamp(float(date_value))
    except ValueError:
        return parser.parse(date_value)
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from datetime import date, datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
6
|
+
|
|
7
|
+
from dateutil import parser
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
10
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
|
+
from unstructured_ingest.v2.interfaces import (
|
|
12
|
+
AccessConfig,
|
|
13
|
+
ConnectionConfig,
|
|
14
|
+
FileData,
|
|
15
|
+
UploadContent,
|
|
16
|
+
Uploader,
|
|
17
|
+
UploaderConfig,
|
|
18
|
+
UploadStager,
|
|
19
|
+
UploadStagerConfig,
|
|
20
|
+
)
|
|
21
|
+
from unstructured_ingest.v2.logger import logger
|
|
22
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
23
|
+
DestinationRegistryEntry,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from weaviate import Client
|
|
28
|
+
|
|
29
|
+
# Connector identifier; used as the default ``connector_type`` of the Weaviate
# connection config and uploader defined in this module.
CONNECTOR_TYPE = "weaviate"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class WeaviateAccessConfig(AccessConfig):
    """Secret values used to authenticate against Weaviate.

    All fields are optional; the uploader's ``_resolve_auth_method`` picks the
    first one that is set, in the order the fields are declared here.
    """

    # Bearer token.
    access_token: Optional[str] = None
    # API key.
    api_key: Optional[str] = None
    # Secret for client-credentials authentication.
    client_secret: Optional[str] = None
    # Password; only used together with the connection config's ``username``.
    password: Optional[str] = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class WeaviateConnectionConfig(ConnectionConfig):
    """Non-secret connection settings for a Weaviate destination."""

    # URL handed to the Weaviate client.
    host_url: str
    # Name of the Weaviate class that uploaded objects are added to.
    class_name: str
    # Secret values; flagged as sensitive.
    access_config: WeaviateAccessConfig = enhanced_field(sensitive=True)
    # User name for password authentication.
    username: Optional[str] = None
    # When true, no authentication object is built at all.
    anonymous: bool = False
    # Scope passed to client-credentials / password authentication.
    scope: Optional[list[str]] = None
    # Refresh token passed along with a bearer access token.
    refresh_token: Optional[str] = None
    connector_type: str = CONNECTOR_TYPE
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class WeaviateUploadStagerConfig(UploadStagerConfig):
    """Configuration for the Weaviate upload stager; declares no options of its own."""

    pass
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class WeaviateUploadStager(UploadStager):
    """Rewrites element dictionaries so they conform to the Weaviate schema.

    ``run`` loads a JSON file of elements, conforms each element in place and
    writes the result to a new JSON file in the output directory.
    """

    upload_stager_config: WeaviateUploadStagerConfig = field(
        default_factory=lambda: WeaviateUploadStagerConfig()
    )

    @staticmethod
    def parse_date_string(date_string: str) -> datetime:
        """Parse a date given either as a numeric timestamp or as a date string.

        Args:
            date_string: Value to parse. It is first tried as a numeric
                timestamp; if that fails for any reason it is handed to
                ``dateutil``'s parser.

        Returns:
            The parsed ``datetime``.
        """
        try:
            timestamp = float(date_string)
            return datetime.fromtimestamp(timestamp)
        except Exception as e:
            # Best effort: anything that is not a usable timestamp falls
            # through to the free-form parser below.
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    @classmethod
    def conform_dict(cls, data: dict) -> None:
        """Update the element dictionary in place to conform to the Weaviate schema.

        Nested structures are serialized to JSON strings, date fields are
        rewritten in ``%Y-%m-%dT%H:%M:%S.%fZ`` form and a few scalar fields are
        cast to ``str``. Fields that are missing or falsy are left untouched.

        Args:
            data: A single element dictionary; modified in place.
        """
        # ``or {}`` also covers keys that are present but set to None, which
        # a plain ``.get(key, {})`` chain would choke on.
        metadata = data.get("metadata") or {}
        data_source = metadata.get("data_source") or {}
        coordinates = metadata.get("coordinates") or {}

        # Dict as string formatting
        if record_locator := data_source.get("record_locator"):
            # Explicit serialization, otherwise fails schema type checking
            data_source["record_locator"] = json.dumps(record_locator)

        # Array of items as string formatting
        if points := coordinates.get("points"):
            coordinates["points"] = json.dumps(points)

        if links := metadata.get("links"):
            metadata["links"] = json.dumps(links)

        if permissions_data := data_source.get("permissions_data"):
            data_source["permissions_data"] = json.dumps(permissions_data)

        # Datetime formatting
        # NOTE(review): timestamps parsed via ``datetime.fromtimestamp`` are
        # naive local times and no timezone conversion happens before the
        # literal "Z" is appended - confirm the intended timezone.
        date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
        for key in ("date_created", "date_modified", "date_processed"):
            if raw_date := data_source.get(key):
                data_source[key] = cls.parse_date_string(raw_date).strftime(date_format)

        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = cls.parse_date_string(last_modified).strftime(
                date_format
            )

        # String casting
        if version := data_source.get("version"):
            data_source["version"] = str(version)

        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Conform every element in a JSON file and write the result to a new file.

        Args:
            elements_filepath: JSON file holding a list of element dictionaries.
            file_data: Not used by this stager.
            output_dir: Directory the staged file is written to.
            output_filename: File name (without ``.json``) of the staged file.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            Path of the staged JSON file.
        """
        # JSON is read and written as UTF-8 explicitly instead of relying on
        # the platform's locale encoding.
        with open(elements_filepath, encoding="utf-8") as elements_file:
            elements_contents = json.load(elements_file)
        for element in elements_contents:
            self.conform_dict(data=element)
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        with open(output_path, "w", encoding="utf-8") as output_file:
            json.dump(elements_contents, output_file)
        return output_path
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@dataclass
class WeaviateUploaderConfig(UploaderConfig):
    """Upload settings for the Weaviate uploader."""

    # Passed to the Weaviate client's batch configuration as ``batch_size``.
    batch_size: int = 100
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
@dataclass
class WeaviateUploader(Uploader):
    """Writes element JSON files to a Weaviate class using the client's batch API.

    The Weaviate client is created in ``__post_init__`` from the connection
    config, so it is not an ``__init__`` argument.
    """

    upload_config: WeaviateUploaderConfig
    connection_config: WeaviateConnectionConfig
    client: Optional["Client"] = field(init=False)
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["weaviate"], extras="weaviate")
    def __post_init__(self):
        """Build the Weaviate client for the configured host and credentials."""
        from weaviate import Client

        credentials = self._resolve_auth_method()
        self.client = Client(
            url=self.connection_config.host_url,
            auth_client_secret=credentials,
        )

    @requires_dependencies(["weaviate"], extras="weaviate")
    def _resolve_auth_method(self):
        """Pick the authentication object matching the configured credentials.

        Checked in order: anonymous access, bearer token, API key, client
        secret, then username + password.

        Returns:
            A ``weaviate.auth`` credentials object, or ``None`` when access is
            anonymous or no credential is configured.
        """
        conn = self.connection_config
        secrets = conn.access_config

        if conn.anonymous:
            return None

        if secrets.access_token:
            from weaviate.auth import AuthBearerToken

            return AuthBearerToken(
                access_token=secrets.access_token,
                refresh_token=conn.refresh_token,
            )

        if secrets.api_key:
            from weaviate.auth import AuthApiKey

            return AuthApiKey(api_key=secrets.api_key)

        if secrets.client_secret:
            from weaviate.auth import AuthClientCredentials

            return AuthClientCredentials(client_secret=secrets.client_secret, scope=conn.scope)

        if conn.username and secrets.password:
            from weaviate.auth import AuthClientPassword

            return AuthClientPassword(
                username=conn.username,
                password=secrets.password,
                scope=conn.scope,
            )

        return None

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Load all element files and add their objects to Weaviate in batches.

        Args:
            contents: Upload items whose ``path`` points at a JSON file
                containing a list of element dictionaries.
            **kwargs: Accepted for interface compatibility; not used here.
        """
        # TODO update to use async support in weaviate client
        #  once the version can be bumped to include it
        records = []
        for item in contents:
            with open(item.path) as staged_file:
                records.extend(json.load(staged_file))

        logger.info(
            f"writing {len(records)} objects to destination "
            f"class {self.connection_config.class_name} "
            f"at {self.connection_config.host_url}",
        )

        self.client.batch.configure(batch_size=self.upload_config.batch_size)
        with self.client.batch as batch:
            for record in records:
                # The embedding is sent as the object's vector rather than
                # as one of its properties.
                embedding = record.pop("embeddings", None)
                batch.add_data_object(
                    record,
                    self.connection_config.class_name,
                    vector=embedding,
                )
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
# Destination registry entry for the Weaviate connector: bundles its connection
# config, uploader and upload stager classes together with their config classes.
weaviate_destination_entry = DestinationRegistryEntry(
    connection_config=WeaviateConnectionConfig,
    uploader=WeaviateUploader,
    uploader_config=WeaviateUploaderConfig,
    upload_stager=WeaviateUploadStager,
    upload_stager_config=WeaviateUploadStagerConfig,
)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
from unstructured.documents.elements import Element
|
|
7
|
+
from unstructured.embed.interfaces import BaseEmbeddingEncoder
|
|
8
|
+
from unstructured.staging.base import elements_from_json
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
|
|
11
|
+
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class EmbedderConfig(EnhancedDataClassJsonMixin):
    """Settings that select and configure an embedding encoder.

    ``embedding_provider`` chooses the encoder; the remaining fields are
    forwarded to that encoder's config where applicable.
    """

    embedding_provider: Optional[str] = None
    embedding_api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
    embedding_model_name: Optional[str] = None
    embedding_aws_access_key_id: Optional[str] = None
    # A credential just like ``embedding_api_key``, so it is flagged as
    # sensitive in the same way.
    embedding_aws_secret_access_key: Optional[str] = enhanced_field(
        default=None, sensitive=True
    )
    embedding_aws_region: Optional[str] = None

    def get_embedder(self) -> BaseEmbeddingEncoder:
        """Instantiate the encoder named by ``embedding_provider``.

        The API key and model name are forwarded only when set. The Bedrock
        encoder is configured from the AWS fields instead.

        Returns:
            The configured embedding encoder.

        Raises:
            ValueError: If ``embedding_provider`` is not one of the
                recognized provider names.
        """
        kwargs: dict[str, Any] = {}
        if self.embedding_api_key:
            kwargs["api_key"] = self.embedding_api_key
        if self.embedding_model_name:
            kwargs["model_name"] = self.embedding_model_name
        # TODO make this more dynamic to map to encoder configs
        # Encoder modules are imported lazily so that only the selected
        # provider's module is loaded.
        if self.embedding_provider == "langchain-openai":
            from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

            return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
        elif self.embedding_provider == "langchain-huggingface":
            from unstructured.embed.huggingface import (
                HuggingFaceEmbeddingConfig,
                HuggingFaceEmbeddingEncoder,
            )

            return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
        elif self.embedding_provider == "octoai":
            from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder

            return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
        elif self.embedding_provider == "langchain-aws-bedrock":
            from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder

            return BedrockEmbeddingEncoder(
                config=BedrockEmbeddingConfig(
                    aws_access_key_id=self.embedding_aws_access_key_id,
                    aws_secret_access_key=self.embedding_aws_secret_access_key,
                    region_name=self.embedding_aws_region,
                )
            )
        elif self.embedding_provider == "langchain-vertexai":
            from unstructured.embed.vertexai import (
                VertexAIEmbeddingConfig,
                VertexAIEmbeddingEncoder,
            )

            return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
        else:
            raise ValueError(f"{self.embedding_provider} not a recognized encoder")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
class Embedder(BaseProcess, ABC):
    """Process step that runs an embedding encoder over the elements in a JSON file."""

    config: EmbedderConfig

    def run(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
        """Embed the elements read from ``elements_filepath``.

        Args:
            elements_filepath: JSON file the elements are loaded from.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The result of the encoder's ``embed_documents`` call, or the
            loaded (empty) element list when the file holds no elements.
        """
        # TODO update base embedder classes to support async
        # The encoder is built before the file is read, so a bad provider
        # setting surfaces even when there is nothing to embed.
        encoder = self.config.get_embedder()
        loaded = elements_from_json(filename=str(elements_filepath))
        if loaded:
            return encoder.embed_documents(elements=loaded)
        return loaded
|