unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from time import time
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
6
|
+
|
|
7
|
+
from dateutil import parser
|
|
8
|
+
from unstructured.documents.elements import DataSourceMetadata
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
11
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
12
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
+
from unstructured_ingest.v2.interfaces import (
|
|
14
|
+
AccessConfig,
|
|
15
|
+
ConnectionConfig,
|
|
16
|
+
Downloader,
|
|
17
|
+
DownloaderConfig,
|
|
18
|
+
DownloadResponse,
|
|
19
|
+
FileData,
|
|
20
|
+
Indexer,
|
|
21
|
+
IndexerConfig,
|
|
22
|
+
SourceIdentifiers,
|
|
23
|
+
download_responses,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.v2.logger import logger
|
|
26
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
27
|
+
SourceRegistryEntry,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from office365.graph_client import GraphClient
|
|
32
|
+
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
33
|
+
|
|
34
|
+
# Connector name stamped onto every FileData this module produces.
CONNECTOR_TYPE = "onedrive"
# Download-size threshold. Despite the "MB" in the name the value is compared
# against the drive item's "size" property in bytes (roughly 512 MB); larger
# files are fetched through a chunked download session.
MAX_MB_SIZE = 512_000_000
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class OnedriveAccessConfig(AccessConfig):
    """Secret portion of the OneDrive connection settings."""

    # Passed as ``client_credential`` to msal's ConfidentialClientApplication.
    client_cred: str
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class OnedriveConnectionConfig(ConnectionConfig):
    """Connection settings for OneDrive plus factories for a token and a Graph client.

    ``tenant`` and ``authority_url`` are excluded from ``repr``; ``access_config``
    is flagged as sensitive through ``enhanced_field``.
    """

    client_id: str
    # Key used to look the user up in ``client.users[...]``.
    user_pname: str
    tenant: str = field(repr=False)
    authority_url: Optional[str] = field(repr=False, default="https://login.microsoftonline.com")
    access_config: OnedriveAccessConfig = enhanced_field(sensitive=True)

    @requires_dependencies(["msal"], extras="onedrive")
    def get_token(self):
        """Acquire an application token for Microsoft Graph through msal.

        Returns:
            The response of ``acquire_token_for_client`` for the
            ``https://graph.microsoft.com/.default`` scope.

        Raises:
            ValueError: re-raised (after logging) when msal rejects the
                credential setup.
            SourceConnectionNetworkError: when the response contains an
                ``error`` key.
        """
        from msal import ConfidentialClientApplication

        try:
            app = ConfidentialClientApplication(
                authority=f"{self.authority_url}/{self.tenant}",
                client_id=self.client_id,
                client_credential=self.access_config.client_cred,
            )
            token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
        except ValueError:
            logger.error("Couldn't set up credentials for OneDrive")
            # Bare raise keeps the original traceback intact.
            raise
        if "error" in token:
            # The description is looked up defensively so that a response
            # without it still surfaces the real error instead of a KeyError.
            raise SourceConnectionNetworkError(
                "failed to fetch token, {}: {}".format(
                    token["error"],
                    token.get("error_description", "no description provided"),
                )
            )
        return token

    @requires_dependencies(["office365"], extras="onedrive")
    def get_client(self) -> "GraphClient":
        """Build a GraphClient that calls ``get_token`` to obtain its token."""
        from office365.graph_client import GraphClient

        # The bound method itself is handed over, not its result.
        client = GraphClient(self.get_token)
        return client
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class OnedriveIndexerConfig(IndexerConfig):
    """Options controlling which OneDrive items get indexed."""

    # Folder to start from; an empty value means the drive root is used.
    path: Optional[str] = ""
    # When True, sub-folders are walked as well.
    recursive: bool = False
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass
class OnedriveIndexer(Indexer):
    """Lists files in a OneDrive folder and describes each one as FileData."""

    connection_config: OnedriveConnectionConfig
    index_config: OnedriveIndexerConfig

    def list_objects(self, folder, recursive) -> list["DriveItem"]:
        """Return the file items under ``folder``.

        Args:
            folder: drive item whose children are listed.
            recursive: when truthy, descend into every child folder as well.
        """
        drive_items = folder.children.get().execute_query()
        files = [d for d in drive_items if d.is_file]
        if not recursive:
            return files
        folders = [d for d in drive_items if d.is_folder]
        for f in folders:
            files.extend(self.list_objects(f, recursive))
        return files

    def get_root(self, client: "GraphClient") -> "DriveItem":
        """Resolve the folder indexing starts from.

        Uses the root of the configured user's drive, or the configured
        ``index_config.path`` below it when one is set.

        Raises:
            ValueError: if the configured path does not resolve to a folder.
        """
        root = client.users[self.connection_config.user_pname].drive.get().execute_query().root
        if fpath := self.index_config.path:
            root = root.get_by_path(fpath).get().execute_query()
            if root is None or not root.is_folder:
                raise ValueError(f"Unable to find directory, given: {fpath}")
        return root

    def get_properties(self, drive_item: "DriveItem") -> dict:
        """Return the subset of ``drive_item.properties`` that json can serialize."""
        properties = drive_item.properties
        filtered_properties = {}
        for k, v in properties.items():
            try:
                json.dumps(v)
                filtered_properties[k] = v
            except TypeError:
                # Best effort: values json cannot encode are left out on purpose.
                pass
        return filtered_properties

    def _relative_to_index_path(self, server_path: str) -> str:
        """Strip the configured index path from the front of ``server_path``.

        Only a leading prefix is removed (never an occurrence further inside
        the path) and an unset/``None`` index path is treated as empty.
        """
        base = (self.index_config.path or "").strip("/")
        candidate = server_path.lstrip("/")
        if base and candidate.startswith(base + "/"):
            candidate = candidate[len(base) + 1 :]
        return candidate.lstrip("/")

    def drive_item_to_file_data(self, drive_item: "DriveItem") -> FileData:
        """Convert a drive item into the FileData record used downstream."""
        # The parent path is taken to be everything after the last ":" of the
        # parent reference, with a single leading "/" removed.
        file_path = drive_item.parent_reference.path.split(":")[-1]
        file_path = file_path[1:] if file_path and file_path[0] == "/" else file_path
        filename = drive_item.name
        server_path = file_path + "/" + filename
        rel_path = self._relative_to_index_path(server_path)
        date_modified_dt = (
            parser.parse(drive_item.last_modified_datetime)
            if drive_item.last_modified_datetime
            else None
        )
        date_created_dt = (
            parser.parse(drive_item.created_datetime) if drive_item.created_datetime else None
        )
        return FileData(
            identifier=drive_item.id,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=SourceIdentifiers(
                fullpath=server_path, filename=drive_item.name, rel_path=rel_path
            ),
            metadata=DataSourceMetadata(
                url=drive_item.parent_reference.path + "/" + drive_item.name,
                version=drive_item.etag,
                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
                # Guarded by the created date itself, not by the modified date.
                date_created=str(date_created_dt.timestamp()) if date_created_dt else None,
                date_processed=str(time()),
                record_locator={
                    "user_pname": self.connection_config.user_pname,
                    "server_relative_path": server_path,
                },
            ),
            additional_metadata=self.get_properties(drive_item=drive_item),
        )

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one FileData per file found under the configured root."""
        client = self.connection_config.get_client()
        root = self.get_root(client=client)
        drive_items = self.list_objects(folder=root, recursive=self.index_config.recursive)
        for drive_item in drive_items:
            yield self.drive_item_to_file_data(drive_item=drive_item)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
@dataclass
class OnedriveDownloaderConfig(DownloaderConfig):
    """Downloader settings for OneDrive; adds nothing to DownloaderConfig."""
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@dataclass
class OnedriveDownloader(Downloader):
    """Fetches a single OneDrive file described by FileData onto local disk."""

    connection_config: OnedriveConnectionConfig
    download_config: OnedriveDownloaderConfig

    @SourceConnectionNetworkError.wrap
    def _fetch_file(self, file_data: FileData):
        """Look up the drive item for ``file_data`` by its full path.

        Raises:
            ValueError: if ``file_data`` carries no usable full path.
            FileNotFoundError: if the lookup yields a falsy result.
        """
        identifiers = file_data.source_identifiers
        if identifiers is None or not identifiers.fullpath:
            raise ValueError(
                f"file data doesn't have enough information to get "
                f"file content: {file_data.to_dict()}"
            )

        server_relative_path = file_data.source_identifiers.fullpath
        graph_client = self.connection_config.get_client()
        user_drive = graph_client.users[self.connection_config.user_pname].drive
        drive_root = user_drive.get().execute_query().root
        drive_file = drive_root.get_by_path(server_relative_path).get().execute_query()
        if drive_file:
            return drive_file
        raise FileNotFoundError(f"file not found: {server_relative_path}")

    def get_download_path(self, file_data: FileData) -> Optional[Path]:
        """Return the local target path: ``download_dir`` joined with the relative path."""
        rel_path = file_data.source_identifiers.relative_path
        # Drop one leading slash so the join stays inside download_dir.
        if rel_path.startswith("/"):
            rel_path = rel_path[1:]
        return self.download_dir / Path(rel_path)

    @SourceConnectionError.wrap
    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download the file to its local path and return a DownloadResponse.

        Files whose reported size exceeds MAX_MB_SIZE go through a chunked
        download session; everything else is downloaded in a single call.
        """
        drive_file = self._fetch_file(file_data=file_data)
        fsize = drive_file.get_property("size", 0)
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        logger.info(f"Downloading {file_data.source_identifiers.fullpath} to {download_path}")
        use_chunks = fsize > MAX_MB_SIZE
        if use_chunks:
            logger.info(f"Downloading file with size: {fsize} bytes in chunks")
        with download_path.open(mode="wb") as f:
            if use_chunks:
                drive_file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
            else:
                drive_file.download(f).execute_query()
        return DownloadResponse(file_data=file_data, path=download_path)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
# Bundles the OneDrive connection, indexer and downloader classes (with their
# config classes) into a single source registry entry.
onedrive_source_entry = SourceRegistryEntry(
    connection_config=OnedriveConnectionConfig,
    indexer=OnedriveIndexer,
    indexer_config=OnedriveIndexerConfig,
    downloader=OnedriveDownloader,
    downloader_config=OnedriveDownloaderConfig,
)
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import TYPE_CHECKING, Optional
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
|
|
5
|
+
from unstructured_ingest.error import (
|
|
6
|
+
DestinationConnectionError,
|
|
7
|
+
)
|
|
8
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
from unstructured_ingest.v2.interfaces import (
|
|
10
|
+
AccessConfig,
|
|
11
|
+
ConnectionConfig,
|
|
12
|
+
)
|
|
13
|
+
from unstructured_ingest.v2.logger import logger
|
|
14
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
15
|
+
DestinationRegistryEntry,
|
|
16
|
+
SourceRegistryEntry,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.v2.processes.connectors.elasticsearch import (
|
|
19
|
+
ElasticsearchDownloader,
|
|
20
|
+
ElasticsearchDownloaderConfig,
|
|
21
|
+
ElasticsearchIndexer,
|
|
22
|
+
ElasticsearchIndexerConfig,
|
|
23
|
+
ElasticsearchUploader,
|
|
24
|
+
ElasticsearchUploaderConfig,
|
|
25
|
+
ElasticsearchUploadStager,
|
|
26
|
+
ElasticsearchUploadStagerConfig,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
from opensearchpy import OpenSearch
|
|
31
|
+
|
|
32
|
+
CONNECTOR_TYPE = "opensearch"

# Since the actual OpenSearch project is a fork of Elasticsearch, we are relying
# heavily on the Elasticsearch connector code, inheriting the functionality as
# much as possible.
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class OpenSearchAccessConfig(AccessConfig):
    """Password and TLS-related options for an OpenSearch connection.

    Every field is optional; the boolean switches default to False and the
    certificate/key fields default to None.
    """

    # Flagged as sensitive through enhanced_field.
    password: Optional[str] = enhanced_field(default=None, sensitive=True)
    use_ssl: bool = False
    verify_certs: bool = False
    ssl_show_warn: bool = False
    ca_certs: Optional[str] = None
    client_cert: Optional[str] = None
    client_key: Optional[str] = None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class OpenSearchClientInput(EnhancedDataClassJsonMixin):
    """Container for the keyword arguments handed to the OpenSearch client.

    ``OpenSearchConnectionConfig.get_client_kwargs`` fills an instance of this
    class and serializes it with ``to_dict``; entries left at ``None`` are
    dropped there before the client is built.
    """

    # (username, password) pair; sensitive so it is redacted in debug output.
    http_auth: Optional[tuple[str, str]] = enhanced_field(default=None, sensitive=True)
    hosts: Optional[list[str]] = None
    use_ssl: bool = False
    verify_certs: bool = False
    ssl_show_warn: bool = False
    ca_certs: Optional[str] = None
    client_cert: Optional[str] = None
    client_key: Optional[str] = None
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class OpenSearchConnectionConfig(ConnectionConfig):
    """Connection settings for OpenSearch and a factory for its client."""

    hosts: Optional[list[str]] = None
    username: Optional[str] = None
    access_config: OpenSearchAccessConfig = enhanced_field(sensitive=True)

    def get_client_kwargs(self) -> dict:
        """Build the keyword arguments used to construct the OpenSearch client.

        Only truthy settings are copied from this config onto the client
        input; the result omits every entry whose value is ``None``.

        Returns:
            dict: keyword arguments for ``opensearchpy.OpenSearch``.
        """
        # Update auth related fields to conform to what the SDK expects based on the
        # supported methods:
        # https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py
        access = self.access_config
        client_input = OpenSearchClientInput()

        if self.hosts:
            client_input.hosts = self.hosts

        # These settings share the same name on both objects; copy each one
        # across only when it is set to a truthy value.
        mirrored_fields = (
            "use_ssl",
            "verify_certs",
            "ssl_show_warn",
            "ca_certs",
            "client_cert",
            "client_key",
        )
        for field_name in mirrored_fields:
            setting = getattr(access, field_name)
            if setting:
                setattr(client_input, field_name, setting)

        # Basic auth is only configured when both halves of the credential exist.
        if self.username and access.password:
            client_input.http_auth = (self.username, access.password)

        logger.debug(
            f"OpenSearch client inputs mapped to: {client_input.to_dict(redact_sensitive=True)}"
        )
        unredacted = client_input.to_dict(redact_sensitive=False)
        return {key: value for key, value in unredacted.items() if value is not None}

    @DestinationConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def get_client(self) -> "OpenSearch":
        """Create an ``OpenSearch`` client from ``get_client_kwargs()``."""
        # Imported lazily so the module loads without the optional dependency.
        from opensearchpy import OpenSearch

        client_kwargs = self.get_client_kwargs()
        return OpenSearch(**client_kwargs)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@dataclass
class OpenSearchIndexer(ElasticsearchIndexer):
    """Elasticsearch indexer specialised to use the OpenSearch SDK."""

    connection_config: OpenSearchConnectionConfig
    # Excluded from __init__; not assigned anywhere in this class.
    client: "OpenSearch" = field(init=False)

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def load_scan(self):
        """Return the ``scan`` helper from ``opensearchpy.helpers``."""
        from opensearchpy.helpers import scan as opensearch_scan

        return opensearch_scan
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@dataclass
class OpenSearchDownloader(ElasticsearchDownloader):
    """Elasticsearch downloader specialised to use the OpenSearch SDK."""

    connection_config: OpenSearchConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def load_async(self):
        """Return the ``(AsyncOpenSearch, async_scan)`` pair from ``opensearchpy``."""
        from opensearchpy import AsyncOpenSearch as async_client_cls
        from opensearchpy.helpers import async_scan as async_scan_helper

        return async_client_cls, async_scan_helper
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@dataclass
class OpenSearchUploader(ElasticsearchUploader):
    """Elasticsearch uploader specialised to use the OpenSearch SDK."""

    connection_config: OpenSearchConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def load_parallel_bulk(self):
        """Return the ``parallel_bulk`` helper from ``opensearchpy.helpers``."""
        from opensearchpy.helpers import parallel_bulk as bulk_helper

        return bulk_helper
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# Source registration for OpenSearch: OpenSearch-specific indexer/downloader
# classes are combined with the config classes of the Elasticsearch connector.
opensearch_source_entry = SourceRegistryEntry(
    connection_config=OpenSearchConnectionConfig,
    indexer_config=ElasticsearchIndexerConfig,
    indexer=OpenSearchIndexer,
    downloader_config=ElasticsearchDownloaderConfig,
    downloader=OpenSearchDownloader,
)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# Destination registration for OpenSearch: only the uploader is
# OpenSearch-specific; the stager and both config classes come from the
# Elasticsearch connector.
opensearch_destination_entry = DestinationRegistryEntry(
    connection_config=OpenSearchConnectionConfig,
    upload_stager=ElasticsearchUploadStager,
    upload_stager_config=ElasticsearchUploadStagerConfig,
    uploader=OpenSearchUploader,
    uploader_config=ElasticsearchUploaderConfig,
)
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import multiprocessing as mp
|
|
3
|
+
import uuid
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
7
|
+
|
|
8
|
+
from unstructured.ingest.v2.logger import logger
|
|
9
|
+
from unstructured.ingest.v2.processes.connector_registry import (
|
|
10
|
+
DestinationRegistryEntry,
|
|
11
|
+
)
|
|
12
|
+
from unstructured.staging.base import flatten_dict
|
|
13
|
+
from unstructured.utils import requires_dependencies
|
|
14
|
+
|
|
15
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
16
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
17
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
18
|
+
from unstructured_ingest.v2.interfaces import (
|
|
19
|
+
AccessConfig,
|
|
20
|
+
ConnectionConfig,
|
|
21
|
+
UploadContent,
|
|
22
|
+
Uploader,
|
|
23
|
+
UploaderConfig,
|
|
24
|
+
UploadStager,
|
|
25
|
+
UploadStagerConfig,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from pinecone import Index as PineconeIndex
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
CONNECTOR_TYPE = "pinecone"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class PineconeAccessConfig(AccessConfig):
    """Access settings for Pinecone; holds the API key passed to the client."""

    # Also registered under the overload name "pinecone_api_key".
    api_key: Optional[str] = enhanced_field(overload_name="pinecone_api_key", default=None)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class PineconeConnectionConfig(ConnectionConfig):
    """Connection settings for a Pinecone index and a factory for its handle."""

    index_name: str
    environment: str
    access_config: PineconeAccessConfig = enhanced_field(sensitive=True)

    @requires_dependencies(["pinecone"], extras="pinecone")
    def get_index(self) -> "PineconeIndex":
        """Create a Pinecone client and return a handle to ``index_name``.

        The index description is fetched once for a debug log line before
        the handle is returned.
        """
        # Imported lazily so the module loads without the optional dependency.
        from pinecone import Pinecone
        from unstructured import __version__ as unstructured_version

        client = Pinecone(
            api_key=self.access_config.api_key,
            source_tag=f"unstructured=={unstructured_version}",
        )

        index_handle = client.Index(self.index_name)
        logger.debug(f"Connected to index: {client.describe_index(self.index_name)}")
        return index_handle
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class PineconeUploadStagerConfig(UploadStagerConfig):
    """Configuration for the Pinecone upload stager; declares no extra fields."""
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
class PineconeUploaderConfig(UploaderConfig):
    """Tuning knobs for the Pinecone uploader."""

    # Number of records passed to each upsert call.
    batch_size: int = 100
    # Size of the worker pool; a value of 1 makes the uploader upsert
    # sequentially without creating a pool.
    num_of_processes: int = 4
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass
class PineconeUploadStager(UploadStager):
    """Convert a JSON file of elements into a JSON file of Pinecone records."""

    upload_stager_config: PineconeUploadStagerConfig = field(
        default_factory=lambda: PineconeUploadStagerConfig()
    )

    @staticmethod
    def conform_dict(element_dict: dict) -> dict:
        """Reshape one element dict into a record with id, values and metadata.

        Args:
            element_dict: a single element as loaded from the elements JSON.
                It is not modified.

        Returns:
            dict: ``id`` is a fresh UUID4 string, ``values`` is the element's
            ``embeddings`` entry (``None`` if absent) and ``metadata`` holds
            the ``text`` entry, a JSON dump of the remaining fields and the
            flattened remaining fields.
        """
        # Work on a shallow copy: popping from the argument itself would
        # silently strip "embeddings" and "text" from the caller's dict.
        remaining = dict(element_dict)
        embeddings = remaining.pop("embeddings", None)
        text = remaining.pop("text", None)

        # While flatten_dict enables indexing on various fields,
        # element_serialized enables easily reloading the element object to memory.
        # element_serialized is formed without text/embeddings to avoid data bloating.
        return {
            "id": str(uuid.uuid4()),
            "values": embeddings,
            "metadata": {
                "text": text,
                "element_serialized": json.dumps(remaining),
                **flatten_dict(
                    remaining,
                    separator="-",
                    flatten_lists=True,
                    remove_none=True,
                ),
            },
        }

    def run(
        self,
        elements_filepath: Path,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Stage the elements in ``elements_filepath`` for upload.

        Args:
            elements_filepath: JSON file containing a list of element dicts.
            output_dir: directory for the staged file; created if missing.
            output_filename: name of the staged file, without the ``.json`` suffix.
            **kwargs: accepted but not used here.

        Returns:
            Path: location of the written ``<output_filename>.json`` file.
        """
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)

        conformed_elements = [
            self.conform_dict(element_dict=element) for element in elements_contents
        ]

        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w") as output_file:
            json.dump(conformed_elements, output_file)
        return output_path
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@dataclass
class PineconeUploader(Uploader):
    """Upsert staged element records into a Pinecone index in batches."""

    upload_config: PineconeUploaderConfig
    connection_config: PineconeConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @DestinationConnectionError.wrap
    def check_connection(self):
        """Verify connectivity by requesting the index handle once."""
        self.connection_config.get_index()

    @requires_dependencies(["pinecone"], extras="pinecone")
    def upsert_batch(self, batch):
        """Upsert one batch of records.

        Raises:
            DestinationConnectionError: when the Pinecone client raises a
                ``PineconeApiException``.
        """
        from pinecone.exceptions import PineconeApiException

        try:
            # A fresh index handle is obtained for every batch.
            pinecone_index = self.connection_config.get_index()
            upsert_result = pinecone_index.upsert(batch)
        except PineconeApiException as api_error:
            raise DestinationConnectionError(f"http error: {api_error}") from api_error
        logger.debug(f"results: {upsert_result}")

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Load every staged file in ``contents`` and upsert all its records.

        Records from all files are pooled into one list, split into batches
        of ``upload_config.batch_size`` and upserted either sequentially
        (``num_of_processes == 1``) or through a multiprocessing pool.
        """
        records = []
        for content in contents:
            with open(content.path) as elements_file:
                records.extend(json.load(elements_file))

        index_name = self.connection_config.index_name
        environment = self.connection_config.environment
        batch_size = self.upload_config.batch_size
        process_count = self.upload_config.num_of_processes

        logger.info(
            f"writing document batches to destination index named {index_name}"
            f" environment named {environment} with batch size {batch_size}"
            f" with {process_count} (number of) processes"
        )

        # Single-process mode: upsert in the calling process, no pool needed.
        if process_count == 1:
            for batch in batch_generator(records, batch_size):
                self.upsert_batch(batch)
            return

        batches = list(batch_generator(records, batch_size))
        with mp.Pool(processes=process_count) as pool:
            pool.map(self.upsert_batch, batches)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# Destination registration for Pinecone: stager and uploader, each listed
# next to the config class it is registered with.
pinecone_destination_entry = DestinationRegistryEntry(
    connection_config=PineconeConnectionConfig,
    upload_stager=PineconeUploadStager,
    upload_stager_config=PineconeUploadStagerConfig,
    uploader=PineconeUploader,
    uploader_config=PineconeUploaderConfig,
)
|