unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, Secret
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.v2.interfaces.connector import AccessConfig
|
|
6
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
7
|
+
from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
|
|
8
|
+
LanceDBRemoteConnectionConfig,
|
|
9
|
+
LanceDBUploader,
|
|
10
|
+
LanceDBUploaderConfig,
|
|
11
|
+
LanceDBUploadStager,
|
|
12
|
+
LanceDBUploadStagerConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Connector type string used as the default `connector_type` of the uploader in this module.
CONNECTOR_TYPE = "lancedb_gcs"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LanceDBGCSAccessConfig(AccessConfig):
    # Sensitive settings for the "lancedb_gcs" connector. The connection config
    # in this module stores an instance of this class wrapped in a pydantic Secret.
    google_service_account_key: str = Field(
        description="The serialized google service account key."
    )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LanceDBGCSConnectionConfig(LanceDBRemoteConnectionConfig):
    # Connection settings for the "lancedb_gcs" connector: the remote LanceDB
    # settings plus the secret-wrapped GCS access config.
    access_config: Secret[LanceDBGCSAccessConfig]

    def get_storage_options(self) -> dict:
        """Return the storage options for the LanceDB connection.

        The result contains every field dumped from the unwrapped access
        config, with the ``timeout`` setting added under the "timeout" key.
        """
        storage_options = self.access_config.get_secret_value().model_dump()
        storage_options["timeout"] = self.timeout
        return storage_options
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class LanceDBGSPUploader(LanceDBUploader):
    """LanceDB uploader bound to the GCS connection config.

    Adds no behavior of its own: it narrows ``connection_config`` to
    ``LanceDBGCSConnectionConfig`` and defaults ``connector_type`` to this
    module's ``CONNECTOR_TYPE``.
    """

    # NOTE(review): the class name reads "GSP" while the rest of the module says "GCS";
    # presumably a typo, but the name is part of the public interface.
    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBGCSConnectionConfig
    connector_type: str = CONNECTOR_TYPE
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Destination registry entry bundling the GCS connection config with the
# shared LanceDB stager/uploader classes and their configs.
lancedb_gcp_destination_entry = DestinationRegistryEntry(
    upload_stager=LanceDBUploadStager,
    upload_stager_config=LanceDBUploadStagerConfig,
    uploader=LanceDBGSPUploader,
    uploader_config=LanceDBUploaderConfig,
    connection_config=LanceDBGCSConnectionConfig,
)
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from contextlib import asynccontextmanager
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
|
|
10
|
+
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from pydantic import Field
|
|
13
|
+
|
|
14
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
15
|
+
from unstructured_ingest.logger import logger
|
|
16
|
+
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
17
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
18
|
+
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
19
|
+
from unstructured_ingest.v2.interfaces.connector import ConnectionConfig
|
|
20
|
+
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
21
|
+
from unstructured_ingest.v2.interfaces.upload_stager import UploadStager, UploadStagerConfig
|
|
22
|
+
from unstructured_ingest.v2.interfaces.uploader import Uploader, UploaderConfig
|
|
23
|
+
|
|
24
|
+
# Connector type string used as the default `connector_type` of LanceDBUploader.
CONNECTOR_TYPE = "lancedb"
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from lancedb import AsyncConnection
|
|
28
|
+
from lancedb.table import AsyncTable
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class LanceDBConnectionConfig(ConnectionConfig, ABC):
    # Abstract base for LanceDB connection settings; subclasses supply the
    # storage options used when connecting.
    uri: str = Field(description="The uri of the database.")

    @abstractmethod
    def get_storage_options(self) -> Optional[dict[str, str]]:
        """Return the ``storage_options`` value passed to ``lancedb.connect_async``.

        Must be overridden; the base implementation only raises.
        """
        raise NotImplementedError

    @asynccontextmanager
    @requires_dependencies(["lancedb"], extras="lancedb")
    @DestinationConnectionError.wrap
    async def get_async_connection(self) -> AsyncGenerator["AsyncConnection", None]:
        """Open an async LanceDB connection to ``self.uri`` and yield it.

        The connection object is used as a context manager around the yield,
        so it is exited when the caller leaves the ``async with`` block.
        """
        # Imported here so the module can be loaded without the optional dependency.
        import lancedb

        storage_options = self.get_storage_options()
        opened = await lancedb.connect_async(self.uri, storage_options=storage_options)
        with opened as connection:
            yield connection
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class LanceDBRemoteConnectionConfig(LanceDBConnectionConfig):
    # Connection settings for LanceDB variants that take a request timeout
    # in addition to the base ``uri``.
    timeout: str = Field(
        default="30s",
        description=(
            # The leading space keeps the two literals from fusing into "finishedin".
            "Timeout for the entire request, from connection until the response body has finished"
            " in a [0-9]+(ns|us|ms|[smhdwy]) format."
        ),
        # Anchored at both ends: an unanchored pattern is satisfied by any value
        # that merely contains a valid duration (e.g. "abc30s!").
        pattern=r"^[0-9]+(ns|us|ms|[smhdwy])$",
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class LanceDBUploadStagerConfig(UploadStagerConfig):
    # No LanceDB-specific stager settings; exists so the stager has its own config type.
    pass
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
class LanceDBUploadStager(UploadStager):
    """Convert a JSON file of elements into a Feather file of flat rows."""

    upload_stager_config: LanceDBUploadStagerConfig = field(
        default_factory=LanceDBUploadStagerConfig
    )

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Stage the elements stored at ``elements_filepath``.

        Loads the JSON list of element dicts, conforms each one with
        ``conform_dict``, and writes the resulting DataFrame to
        ``output_dir / output_filename`` with its suffix replaced by
        ``.feather``.

        Returns:
            The path of the Feather file that was written.
        """
        with open(elements_filepath) as elements_file:
            elements_contents: list[dict] = json.load(elements_file)

        rows = []
        for element in elements_contents:
            rows.append(self.conform_dict(element_dict=element, file_data=file_data))
        frame = pd.DataFrame(rows)

        feather_path = (output_dir / output_filename).with_suffix(".feather")
        frame.to_feather(feather_path)
        return feather_path

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Build one output row from a single element dict.

        The element's "embeddings" value (``None`` when absent) is stored
        under "vector", the record id is set to ``file_data.identifier``, and
        the remaining content is flattened with "-" as the key separator. The
        input dict itself is not modified; a shallow copy is used.
        """
        remaining = element_dict.copy()
        row = {"vector": remaining.pop("embeddings", None)}
        row[RECORD_ID_LABEL] = file_data.identifier
        # Flattened keys go last, so they win over the two keys set above on a clash.
        row.update(flatten_dict(remaining, separator="-"))
        return row
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class LanceDBUploaderConfig(UploaderConfig):
    # Upload settings: the table that LanceDBUploader opens on the connection.
    table_name: str = Field(description="The name of the table.")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@dataclass
class LanceDBUploader(Uploader):
    """Upload staged Feather files into an existing LanceDB table."""

    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @DestinationConnectionError.wrap
    def precheck(self):
        """Check that the configured table can be opened, then close it again."""

        async def _open_and_close() -> None:
            async with self.connection_config.get_async_connection() as conn:
                opened = await conn.open_table(self.upload_config.table_name)
                opened.close()

        asyncio.run(_open_and_close())

    @asynccontextmanager
    async def get_table(self) -> AsyncGenerator["AsyncTable", None]:
        """Yield the configured table, closing it when the context exits."""
        table_name = self.upload_config.table_name
        async with self.connection_config.get_async_connection() as conn:
            table = await conn.open_table(table_name)
            try:
                yield table
            finally:
                table.close()

    async def run_async(self, path, file_data, **kwargs):
        """Write the Feather file at ``path`` into the table.

        The frame is first aligned with the table schema. When the schema has
        the record-id column, rows whose record id equals
        ``file_data.identifier`` are deleted before the new rows are added;
        otherwise a warning is logged and the rows are only appended.
        """
        frame = pd.read_feather(path)
        async with self.get_table() as table:
            schema = await table.schema()
            frame = self._fit_to_schema(frame, schema)
            if RECORD_ID_LABEL in schema.names:
                await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
            else:
                logger.warning(
                    f"Designated table doesn't contain {RECORD_ID_LABEL} column of type"
                    " string which is required to support overwriting updates on subsequent"
                    " uploads of the same record. New rows will be appended instead."
                )
            await table.add(data=frame)

    def _fit_to_schema(self, df: pd.DataFrame, schema) -> pd.DataFrame:
        """Return ``df`` reshaped to carry exactly the schema's column names.

        Columns not named in ``schema.names`` are dropped; schema names absent
        from the frame are added from an empty Series. Both adjustments are
        logged at info level.
        """
        present = set(df.columns)
        expected = set(schema.names)
        extra = present - expected
        absent = expected - present

        if extra:
            logger.info(
                "Following columns will be dropped to match the table's schema: "
                f"{', '.join(extra)}"
            )
        if absent:
            logger.info(
                "Following null filled columns will be added to match the table's schema:"
                f" {', '.join(absent)} "
            )

        fitted = df.drop(columns=extra)
        for name in absent:
            fitted[name] = pd.Series()
        return fitted
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, Secret
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.v2.interfaces.connector import AccessConfig
|
|
6
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
7
|
+
from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
|
|
8
|
+
LanceDBConnectionConfig,
|
|
9
|
+
LanceDBUploader,
|
|
10
|
+
LanceDBUploaderConfig,
|
|
11
|
+
LanceDBUploadStager,
|
|
12
|
+
LanceDBUploadStagerConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Connector type string used as the default `connector_type` of LanceDBLocalUploader.
CONNECTOR_TYPE = "lancedb_local"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LanceDBLocalAccessConfig(AccessConfig):
    # Declares no fields; serves as the access-config type for the "lancedb_local" connector.
    pass
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class LanceDBLocalConnectionConfig(LanceDBConnectionConfig):
    # Connection settings for the "lancedb_local" connector. The access config
    # defaults to a fresh, field-less LanceDBLocalAccessConfig.
    access_config: Secret[LanceDBLocalAccessConfig] = Field(
        default_factory=LanceDBLocalAccessConfig, validate_default=True
    )

    def get_storage_options(self) -> None:
        """Return ``None``: this connector passes no storage options to LanceDB."""
        return None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class LanceDBLocalUploader(LanceDBUploader):
    """LanceDB uploader bound to the local connection config.

    Adds no behavior of its own: it narrows ``connection_config`` to
    ``LanceDBLocalConnectionConfig`` and defaults ``connector_type`` to this
    module's ``CONNECTOR_TYPE``.
    """

    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBLocalConnectionConfig
    connector_type: str = CONNECTOR_TYPE
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Destination registry entry bundling the local connection config with the
# shared LanceDB stager/uploader classes and their configs.
lancedb_local_destination_entry = DestinationRegistryEntry(
    upload_stager=LanceDBUploadStager,
    upload_stager_config=LanceDBUploadStagerConfig,
    uploader=LanceDBLocalUploader,
    uploader_config=LanceDBUploaderConfig,
    connection_config=LanceDBLocalConnectionConfig,
)
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import json
|
|
3
|
+
import shutil
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from time import time
|
|
7
|
+
from typing import Any, Generator
|
|
8
|
+
|
|
9
|
+
from pydantic import Field, Secret
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.v2.interfaces import (
|
|
12
|
+
AccessConfig,
|
|
13
|
+
ConnectionConfig,
|
|
14
|
+
Downloader,
|
|
15
|
+
DownloaderConfig,
|
|
16
|
+
DownloadResponse,
|
|
17
|
+
FileData,
|
|
18
|
+
FileDataSourceMetadata,
|
|
19
|
+
Indexer,
|
|
20
|
+
IndexerConfig,
|
|
21
|
+
SourceIdentifiers,
|
|
22
|
+
Uploader,
|
|
23
|
+
UploaderConfig,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.v2.logger import logger
|
|
26
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
27
|
+
DestinationRegistryEntry,
|
|
28
|
+
SourceRegistryEntry,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
# Connector type string used by the local indexer, downloader and uploader in this module.
CONNECTOR_TYPE = "local"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class LocalAccessConfig(AccessConfig):
    # Declares no fields; serves as the access-config type for the "local" connector.
    pass
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class LocalConnectionConfig(ConnectionConfig):
    # Connection settings for the "local" connector; the only field is the
    # secret-wrapped access config, which defaults to an empty LocalAccessConfig.
    access_config: Secret[LocalAccessConfig] = Field(
        default=LocalAccessConfig(), validate_default=True
    )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class LocalIndexerConfig(IndexerConfig):
    # Settings that select which local files the indexer lists.
    input_path: Path = Field(
        description="Path to the location in the local file system that will be processed."
    )
    recursive: bool = Field(
        default=False,
        description=(
            "Recursively download files in their respective folders "
            "otherwise stop at the files in provided folder level."
        ),
    )

    @property
    def path(self) -> Path:
        """Resolved (absolute) form of ``input_path``."""
        configured = Path(self.input_path)
        return configured.resolve()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class LocalIndexer(Indexer):
    """List files under the configured input path and emit a ``FileData`` for each."""

    index_config: LocalIndexerConfig
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    def list_files(self) -> list[Path]:
        """Return the files to index.

        A file input path yields just that file. A directory yields its
        regular files, descending into sub-directories only when
        ``index_config.recursive`` is set.
        """
        input_path = self.index_config.path
        if input_path.is_file():
            # Returned directly rather than via glob: glob would treat "[", "*"
            # or "?" in a real file name as a pattern and match nothing.
            return [input_path]
        candidates = input_path.rglob("*") if self.index_config.recursive else input_path.glob("*")
        return [f for f in candidates if f.is_file()]

    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
        """Collect source metadata for ``path`` from its ``stat`` result.

        Each attribute is read best-effort: a failure is logged as a warning
        and the value is left as ``None``. ``date_processed`` is the current
        time and the record locator holds the resolved path.
        """
        stats = path.stat()
        try:
            date_modified = str(stats.st_mtime)
        except Exception as e:
            logger.warning(f"Couldn't detect date modified: {e}")
            date_modified = None

        try:
            # Not every platform's stat result carries st_birthtime.
            date_created = str(stats.st_birthtime)
        except Exception as e:
            logger.warning(f"Couldn't detect date created: {e}")
            date_created = None

        try:
            mode = stats.st_mode
            permissions_data = [{"mode": mode}]
        except Exception as e:
            logger.warning(f"Couldn't detect file mode: {e}")
            permissions_data = None

        try:
            filesize_bytes = stats.st_size
        except Exception as e:
            logger.warning(f"Couldn't detect file size: {e}")
            filesize_bytes = None

        return FileDataSourceMetadata(
            date_modified=date_modified,
            date_created=date_created,
            date_processed=str(time()),
            permissions_data=permissions_data,
            record_locator={"path": str(path.resolve())},
            filesize_bytes=filesize_bytes,
        )

    def _relative_path(self, file_path: Path, base_path: Path) -> str:
        """Return ``file_path`` relative to ``base_path`` as a string.

        The resolved path is tried first. If it lies outside ``base_path``
        (for example a symlink pointing elsewhere), the unresolved path is
        tried next, and the bare file name is the last resort.
        """
        for candidate in (file_path.resolve(), file_path):
            try:
                return str(candidate.relative_to(base_path))
            except ValueError:
                continue
        return file_path.name

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every file returned by ``list_files``.

        ``rel_path`` is the file's path relative to the input directory, or
        the input file's own name when the input path is a single file.
        """
        base_path = self.index_config.path.resolve()
        base_is_file = base_path.is_file()
        for file_path in self.list_files():
            resolved = str(file_path.resolve())
            if base_is_file:
                rel_path = base_path.name
            else:
                # Only the leading base directory is stripped. The previous
                # str.replace removed every occurrence of the base path text.
                rel_path = self._relative_path(file_path, base_path)
            file_data = FileData(
                identifier=resolved,
                connector_type=CONNECTOR_TYPE,
                source_identifiers=SourceIdentifiers(
                    fullpath=resolved,
                    filename=file_path.name,
                    rel_path=rel_path,
                ),
                metadata=self.get_file_metadata(path=file_path),
            )
            yield file_data
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class LocalDownloaderConfig(DownloaderConfig):
    # No local-specific download settings; exists so LocalDownloader has its own config type.
    pass
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@dataclass
class LocalDownloader(Downloader):
    """Downloader for local files: it hands back the file's existing path."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: LocalConnectionConfig = field(default_factory=LocalConnectionConfig)
    download_config: LocalDownloaderConfig = field(default_factory=LocalDownloaderConfig)

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the source file's full path as a ``Path``."""
        source_path = file_data.source_identifiers.fullpath
        return Path(source_path)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Build a ``DownloadResponse`` pointing at the source file's full path."""
        local_path = Path(file_data.source_identifiers.fullpath)
        return DownloadResponse(file_data=file_data, path=local_path)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class LocalUploaderConfig(UploaderConfig):
    # Settings for writing output to the local filesystem.
    output_dir: str = Field(
        default="structured-output", description="Local path to write partitioned output to"
    )

    @property
    def output_path(self) -> Path:
        """Resolved (absolute) form of ``output_dir``."""
        return Path(self.output_dir).resolve()

    def __post_init__(self):
        """Reject an output path that already exists as a regular file.

        Raises:
            ValueError: if ``output_path`` exists and is a file.
        """
        if self.output_path.exists() and self.output_path.is_file():
            raise ValueError("output path already exists as a file")

    def model_post_init(self, __context: Any) -> None:
        """Run the output-path check once pydantic has built the model.

        ``__post_init__`` is a dataclass hook that pydantic models do not
        call, so without this delegation the check would never execute.
        """
        super().model_post_init(__context)
        self.__post_init__()
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@dataclass
class LocalUploader(Uploader):
    """Write output files beneath the configured local output directory."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: LocalUploaderConfig = field(default_factory=LocalUploaderConfig)
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )

    def is_async(self) -> bool:
        """Return ``False``: this uploader runs synchronously."""
        return False

    def get_destination_path(self, file_data: FileData) -> Path:
        """Compute the output path for ``file_data`` and create its parent directory.

        With source identifiers, the destination mirrors the source's relative
        path under the output directory with ``.json`` appended to the final
        component. Without them, it is ``<identifier>.json`` directly under
        the output directory.
        """
        if source_identifiers := file_data.source_identifiers:
            rel_path = (
                source_identifiers.relative_path[1:]
                if source_identifiers.relative_path.startswith("/")
                else source_identifiers.relative_path
            )
            new_path = self.upload_config.output_path / Path(rel_path)
            # Only the final component gets the suffix. The previous
            # str.replace on the whole path also rewrote any directory whose
            # name contained the file name.
            final_path = new_path.parent / f"{new_path.name}.json"
        else:
            final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
        final_path.parent.mkdir(parents=True, exist_ok=True)
        return final_path

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Serialize ``data`` as JSON to the destination path for ``file_data``."""
        final_path = self.get_destination_path(file_data=file_data)
        with final_path.open("w") as f:
            json.dump(data, f)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Copy the file at ``path`` to the destination path for ``file_data``."""
        final_path = self.get_destination_path(file_data=file_data)
        logger.debug(f"copying file from {path} to {final_path}")
        shutil.copy(src=str(path), dst=str(final_path))
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
# Source registry entry bundling the local indexer and downloader with their configs.
local_source_entry = SourceRegistryEntry(
    connection_config=LocalConnectionConfig,
    indexer_config=LocalIndexerConfig,
    indexer=LocalIndexer,
    downloader_config=LocalDownloaderConfig,
    downloader=LocalDownloader,
)
|
|
214
|
+
|
|
215
|
+
# Destination registry entry for the local uploader and its config.
local_destination_entry = DestinationRegistryEntry(
    uploader_config=LocalUploaderConfig,
    uploader=LocalUploader,
)
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional, Union
|
|
5
|
+
|
|
6
|
+
from dateutil import parser
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.error import DestinationConnectionError, WriteError
|
|
10
|
+
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
11
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
12
|
+
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
13
|
+
from unstructured_ingest.v2.interfaces import (
|
|
14
|
+
AccessConfig,
|
|
15
|
+
ConnectionConfig,
|
|
16
|
+
FileData,
|
|
17
|
+
Uploader,
|
|
18
|
+
UploaderConfig,
|
|
19
|
+
UploadStager,
|
|
20
|
+
UploadStagerConfig,
|
|
21
|
+
)
|
|
22
|
+
from unstructured_ingest.v2.logger import logger
|
|
23
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
24
|
+
DestinationRegistryEntry,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from pymilvus import MilvusClient
|
|
29
|
+
|
|
30
|
+
# Connector type string used as the default `connector_type` of MilvusUploader.
CONNECTOR_TYPE = "milvus"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class MilvusAccessConfig(AccessConfig):
    # Sensitive Milvus credentials; both are optional and default to None.
    # MilvusConnectionConfig stores an instance wrapped in a pydantic Secret.
    password: Optional[str] = Field(default=None, description="Milvus password")
    token: Optional[str] = Field(default=None, description="Milvus access token")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class MilvusConnectionConfig(ConnectionConfig):
    # Connection settings used to construct a pymilvus MilvusClient.
    access_config: Secret[MilvusAccessConfig] = Field(
        default=MilvusAccessConfig(), validate_default=True
    )
    uri: Optional[str] = Field(
        default=None, description="Milvus uri", examples=["http://localhost:19530"]
    )
    user: Optional[str] = Field(default=None, description="Milvus user")
    db_name: Optional[str] = Field(default=None, description="Milvus database name")

    def get_connection_kwargs(self) -> dict[str, Any]:
        """Return the keyword arguments for ``MilvusClient``.

        Merges this model's fields (minus ``access_config``) with the
        unwrapped access config fields, then omits every entry whose value is
        ``None``.
        """
        secret_fields = self.access_config.get_secret_value().model_dump()
        merged = self.model_dump()
        merged.pop("access_config", None)
        merged.update(secret_fields)
        # Drop any that were not set explicitly
        return {key: value for key, value in merged.items() if value is not None}

    @requires_dependencies(["pymilvus"], extras="milvus")
    @contextmanager
    def get_client(self) -> Generator["MilvusClient", None, None]:
        """Yield a ``MilvusClient`` built from the connection kwargs, closing it on exit."""
        from pymilvus import MilvusClient

        # Constructed outside the try block: if construction fails there is no client to close.
        client = MilvusClient(**self.get_connection_kwargs())
        try:
            yield client
        finally:
            if client:
                client.close()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class MilvusUploadStagerConfig(UploadStagerConfig):
    # Options read by MilvusUploadStager.conform_dict when shaping each element.
    fields_to_include: Optional[list[str]] = None
    """If set - list of fields to include in the output.
    Unspecified fields are removed from the elements.
    This action takes place after metadata flattening.
    Missing fields will cause stager to throw KeyError."""

    flatten_metadata: bool = True
    """If set - flatten "metadata" key and put contents directly into data"""
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class MilvusUploadStager(UploadStager):
    """Reshape element dicts into the row format written to Milvus."""

    upload_stager_config: MilvusUploadStagerConfig = field(
        default_factory=lambda: MilvusUploadStagerConfig()
    )

    @staticmethod
    def parse_date_string(date_string: str) -> float:
        """Convert ``date_string`` to a float timestamp.

        A value that ``float`` accepts is returned as that float; anything
        else is parsed with ``dateutil`` and converted via ``.timestamp()``.
        """
        try:
            numeric = float(date_string)
        except ValueError:
            numeric = None
        if numeric is not None:
            return numeric
        return parser.parse(date_string).timestamp()

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Return a copy of ``element_dict`` shaped for insertion.

        In order: optionally flatten "metadata" into the top level, default
        "is_continuation" to ``False``, optionally keep only
        ``fields_to_include``, convert the known date columns to timestamps,
        JSON-encode the known list/dict columns, and set the record id to
        ``file_data.identifier``.

        Raises:
            KeyError: if a name in ``fields_to_include`` is not present.
        """
        config = self.upload_stager_config
        record = element_dict.copy()

        # "metadata" is only popped when flattening is enabled.
        if config.flatten_metadata:
            metadata = record.pop("metadata", None)
            if metadata:
                record.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))

        # TODO: milvus sdk doesn't seem to support defaults via the schema yet,
        # remove once that gets updated
        record.setdefault("is_continuation", False)

        wanted = config.fields_to_include
        if wanted:
            record = {key: value for key, value in record.items() if key in wanted}
            for required_key in wanted:
                if required_key not in record:
                    raise KeyError(f"Field '{required_key}' is missing in data!")

        date_keys = (
            "data_source_date_created",
            "data_source_date_modified",
            "data_source_date_processed",
            "last_modified",
        )
        for date_key in date_keys:
            if date_key in record:
                record[date_key] = self.parse_date_string(record[date_key])

        for json_key in ("languages", "data_source_permissions_data"):
            if json_key in record:
                record[json_key] = json.dumps(record[json_key])

        record[RECORD_ID_LABEL] = file_data.identifier
        return record
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class MilvusUploaderConfig(UploaderConfig):
    # Upload settings read by MilvusUploader: the optional database to switch
    # to, the target collection, and the field used to find earlier rows of
    # the same record.
    db_name: Optional[str] = Field(default=None, description="Milvus database name")
    collection_name: str = Field(description="Milvus collections to write to")
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
@dataclass
class MilvusUploader(Uploader):
    """Replace a record's rows in a Milvus collection with freshly staged data."""

    connection_config: MilvusConnectionConfig
    upload_config: MilvusUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    @DestinationConnectionError.wrap
    def precheck(self):
        """Verify that the configured collection exists.

        Raises:
            DestinationConnectionError: if the collection is missing or the
                Milvus client raises a ``MilvusException``.
        """
        from pymilvus import MilvusException

        try:
            with self.get_client() as client:
                if not client.has_collection(self.upload_config.collection_name):
                    raise DestinationConnectionError(
                        f"Collection '{self.upload_config.collection_name}' does not exist"
                    )
        except MilvusException as milvus_exception:
            raise DestinationConnectionError(
                f"failed to precheck Milvus: {str(milvus_exception.message)}"
            ) from milvus_exception

    @contextmanager
    def get_client(self) -> Generator["MilvusClient", None, None]:
        """Yield a client, switched to ``upload_config.db_name`` when one is set."""
        with self.connection_config.get_client() as client:
            if db_name := self.upload_config.db_name:
                client.using_database(db_name=db_name)
            yield client

    def delete_by_record_id(self, file_data: FileData) -> None:
        """Delete rows whose record-id field equals ``file_data.identifier``."""
        record_id_key = self.upload_config.record_id_key
        # Log the key actually used in the filter, which may differ from RECORD_ID_LABEL.
        logger.info(
            f"deleting any content with metadata {record_id_key}={file_data.identifier} "
            f"from milvus collection {self.upload_config.collection_name}"
        )
        with self.get_client() as client:
            delete_filter = f'{record_id_key} == "{file_data.identifier}"'
            resp = client.delete(
                collection_name=self.upload_config.collection_name, filter=delete_filter
            )
            logger.info(
                "deleted {} records from milvus collection {}".format(
                    resp["delete_count"], self.upload_config.collection_name
                )
            )

    @requires_dependencies(["pymilvus"], extras="milvus")
    def insert_results(self, data: Union[dict, list[dict]]):
        """Insert ``data`` into the configured collection.

        Raises:
            WriteError: if the client raises a ``MilvusException`` or the
                response reports a positive ``err_count``.
        """
        from pymilvus import MilvusException

        # A single dict is one entry; len() on it would count its keys.
        entry_count = 1 if isinstance(data, dict) else len(data)
        # Mirror get_client(): the upload-level database overrides the connection-level one.
        db_name = self.upload_config.db_name or self.connection_config.db_name
        logger.info(
            f"uploading {entry_count} entries to {db_name} "
            f"db in collection {self.upload_config.collection_name}"
        )
        with self.get_client() as client:
            try:
                res = client.insert(collection_name=self.upload_config.collection_name, data=data)
            except MilvusException as milvus_exception:
                raise WriteError(
                    f"failed to upload records to Milvus: {str(milvus_exception.message)}"
                ) from milvus_exception
            if "err_count" in res and isinstance(res["err_count"], int) and res["err_count"] > 0:
                err_count = res["err_count"]
                raise WriteError(f"failed to upload {err_count} docs")

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Delete the record's existing rows, then insert ``data``."""
        self.delete_by_record_id(file_data=file_data)
        self.insert_results(data=data)
|
218
|
+
|
|
219
|
+
# Destination registry entry bundling the Milvus connection config, stager and uploader.
milvus_destination_entry = DestinationRegistryEntry(
    upload_stager_config=MilvusUploadStagerConfig,
    upload_stager=MilvusUploadStager,
    uploader_config=MilvusUploaderConfig,
    uploader=MilvusUploader,
    connection_config=MilvusConnectionConfig,
)
|