unstructured-ingest 0.3.13__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import hashlib
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from time import time
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field, Secret
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest import __name__ as integration_name
|
|
11
|
+
from unstructured_ingest.__version__ import __version__ as integration_version
|
|
12
|
+
from unstructured_ingest.error import (
|
|
13
|
+
DestinationConnectionError,
|
|
14
|
+
SourceConnectionError,
|
|
15
|
+
SourceConnectionNetworkError,
|
|
16
|
+
)
|
|
17
|
+
from unstructured_ingest.utils.data_prep import batch_generator, get_data
|
|
18
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
19
|
+
from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
|
|
20
|
+
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
21
|
+
from unstructured_ingest.v2.interfaces import (
|
|
22
|
+
AccessConfig,
|
|
23
|
+
BatchFileData,
|
|
24
|
+
BatchItem,
|
|
25
|
+
ConnectionConfig,
|
|
26
|
+
Downloader,
|
|
27
|
+
DownloaderConfig,
|
|
28
|
+
DownloadResponse,
|
|
29
|
+
FileData,
|
|
30
|
+
FileDataSourceMetadata,
|
|
31
|
+
Indexer,
|
|
32
|
+
IndexerConfig,
|
|
33
|
+
SourceIdentifiers,
|
|
34
|
+
Uploader,
|
|
35
|
+
UploaderConfig,
|
|
36
|
+
UploadStager,
|
|
37
|
+
UploadStagerConfig,
|
|
38
|
+
download_responses,
|
|
39
|
+
)
|
|
40
|
+
from unstructured_ingest.v2.logger import logger
|
|
41
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
42
|
+
DestinationRegistryEntry,
|
|
43
|
+
SourceRegistryEntry,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
if TYPE_CHECKING:
|
|
47
|
+
from astrapy import AsyncCollection as AstraDBAsyncCollection
|
|
48
|
+
from astrapy import Collection as AstraDBCollection
|
|
49
|
+
from astrapy import DataAPIClient as AstraDBClient
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
CONNECTOR_TYPE = "astradb"
|
|
53
|
+
|
|
54
|
+
MAX_CONTENT_PARAM_BYTE_SIZE = 8000
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class AstraDBAdditionalMetadata(BaseModel):
    """Astra DB location details carried alongside a batch of record ids."""

    # Name of the collection the batch's records belong to.
    collection_name: str
    # Keyspace holding the collection; ``None`` is forwarded unchanged to the client.
    keyspace: Optional[str] = None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class AstraDBBatchFileData(BatchFileData):
    """Batch file data that also records which Astra DB collection it refers to."""

    # Collection name and keyspace the batch items were indexed from.
    additional_metadata: AstraDBAdditionalMetadata
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class AstraDBAccessConfig(AccessConfig):
    """Sensitive values needed to reach an Astra DB database."""

    token: str = Field(
        description="Astra DB Token with access to the database.",
    )
    api_endpoint: str = Field(
        description="The API endpoint for the Astra DB.",
    )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class AstraDBConnectionConfig(ConnectionConfig):
    """Connection settings shared by the Astra DB source and destination connectors."""

    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    access_config: Secret[AstraDBAccessConfig]

    @requires_dependencies(["astrapy"], extras="astradb")
    def get_client(self) -> "AstraDBClient":
        """Build an astrapy ``DataAPIClient`` tagged with this integration's identity.

        Returns:
            A client created with ``caller_name`` / ``caller_version`` set to this
            package's name and version (used for Astra DB tracking).
        """
        # Imported here so the module can be loaded without the optional extra.
        from astrapy import DataAPIClient

        caller_info = {
            "caller_name": integration_name,
            "caller_version": integration_version,
        }
        return DataAPIClient(**caller_info)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_astra_collection(
    connection_config: AstraDBConnectionConfig,
    collection_name: str,
    keyspace: str,
) -> "AstraDBCollection":
    """Return a synchronous handle to an Astra DB collection.

    Args:
        connection_config: Supplies the client and the secret endpoint/token.
        collection_name: Name of the collection to open.
        keyspace: Keyspace passed to ``get_database``.

    Returns:
        The collection object obtained from the database handle.
    """
    credentials = connection_config.access_config.get_secret_value()

    # The client carries caller_name/version for Astra DB tracking.
    database = connection_config.get_client().get_database(
        api_endpoint=credentials.api_endpoint,
        token=credentials.token,
        keyspace=keyspace,
    )
    return database.get_collection(name=collection_name)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
async def get_async_astra_collection(
    connection_config: AstraDBConnectionConfig,
    collection_name: str,
    keyspace: str,
) -> "AstraDBAsyncCollection":
    """Return an asynchronous handle to an Astra DB collection.

    Args:
        connection_config: Supplies the client and the secret endpoint/token.
        collection_name: Name of the collection to open.
        keyspace: Keyspace passed to ``get_async_database``.

    Returns:
        The awaited result of ``get_collection`` on the async database handle.
    """
    credentials = connection_config.access_config.get_secret_value()

    async_database = connection_config.get_client().get_async_database(
        api_endpoint=credentials.api_endpoint,
        token=credentials.token,
        keyspace=keyspace,
    )
    return await async_database.get_collection(name=collection_name)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class AstraDBUploadStagerConfig(UploadStagerConfig):
    """Stager settings for Astra DB; declares no options beyond the base config."""
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class AstraDBIndexerConfig(IndexerConfig):
    """Options controlling which collection is indexed and how ids are batched."""

    collection_name: str = Field(
        description=(
            "The name of the Astra DB collection. Note that the collection name "
            "must only include letters, numbers, and underscores."
        )
    )
    keyspace: Optional[str] = Field(
        default=None,
        description="The Astra DB connection keyspace.",
    )
    batch_size: int = Field(
        default=20,
        description="Number of records per batch",
    )
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class AstraDBDownloaderConfig(DownloaderConfig):
    """Options for the Astra DB downloader."""

    # Declared with pydantic's ``Field`` (not ``dataclasses.field``) to match the
    # other config models in this module. When non-empty, the downloader appends a
    # hash of these names to each downloaded file's identifier.
    fields: list[str] = Field(default_factory=list)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class AstraDBUploaderConfig(UploaderConfig):
|
|
153
|
+
collection_name: str = Field(
|
|
154
|
+
description="The name of the Astra DB collection. "
|
|
155
|
+
"Note that the collection name must only include letters, "
|
|
156
|
+
"numbers, and underscores."
|
|
157
|
+
)
|
|
158
|
+
keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
|
|
159
|
+
requested_indexing_policy: Optional[dict[str, Any]] = Field(
|
|
160
|
+
default=None,
|
|
161
|
+
description="The indexing policy to use for the collection.",
|
|
162
|
+
examples=['{"deny": ["metadata"]}'],
|
|
163
|
+
)
|
|
164
|
+
batch_size: int = Field(default=20, description="Number of records per batch")
|
|
165
|
+
record_id_key: str = Field(
|
|
166
|
+
default=RECORD_ID_LABEL,
|
|
167
|
+
description="searchable key to find entries for the same record on previous runs",
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@dataclass
class AstraDBIndexer(Indexer):
    """Enumerate the document ids of an Astra DB collection and emit them in batches."""

    connection_config: AstraDBConnectionConfig
    index_config: AstraDBIndexerConfig

    def get_collection(self) -> "AstraDBCollection":
        """Open the collection named in ``index_config``."""
        return get_astra_collection(
            connection_config=self.connection_config,
            collection_name=self.index_config.collection_name,
            keyspace=self.index_config.keyspace,
        )

    def precheck(self) -> None:
        """Validate the connection by requesting the collection's options.

        Raises:
            SourceConnectionError: If opening the collection or the request fails.
        """
        try:
            self.get_collection().options()
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def _get_doc_ids(self) -> set[str]:
        """Fetches all document ids in an index"""
        # Project only ``_id`` so the cursor does not return full documents.
        cursor = self.get_collection().find({}, projection={"_id": True})
        return {astra_record["_id"] for astra_record in cursor}

    def run(self, **kwargs: Any) -> Generator[AstraDBBatchFileData, None, None]:
        """Yield one ``AstraDBBatchFileData`` per batch of document ids.

        Ids are sorted before batching so that batch composition is reproducible
        between runs; a set has no defined iteration order.
        """
        ids = sorted(self._get_doc_ids())

        for batch in batch_generator(ids, self.index_config.batch_size):
            yield AstraDBBatchFileData(
                connector_type=CONNECTOR_TYPE,
                metadata=FileDataSourceMetadata(
                    date_processed=str(time()),
                ),
                additional_metadata=AstraDBAdditionalMetadata(
                    collection_name=self.index_config.collection_name,
                    keyspace=self.index_config.keyspace,
                ),
                batch_items=[BatchItem(identifier=b) for b in batch],
            )
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
@dataclass
class AstraDBDownloader(Downloader):
    """Download Astra DB records, writing each one to its own CSV file."""

    connection_config: AstraDBConnectionConfig
    download_config: AstraDBDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Report that this downloader is driven through ``run_async``."""
        return True

    def get_identifier(self, record_id: str) -> str:
        """Build the file-name stem for a record.

        Returns the record id as a string; when ``download_config.fields`` is
        non-empty, the first 8 hex characters of the SHA-256 of the
        comma-joined field names are appended after a dash.
        """
        identifier = f"{record_id}"
        if self.download_config.fields:
            joined_fields = ",".join(self.download_config.fields)
            fields_digest = hashlib.sha256(joined_fields.encode()).hexdigest()[:8]
            identifier = f"{identifier}-{fields_digest}"
        return identifier

    def write_astra_result_to_csv(self, astra_result: dict, download_path: str) -> None:
        """Write one record as a two-row CSV: a header row of keys, then the values."""
        # newline="" is what the csv module requires for files handed to a writer;
        # without it, line endings inside and between rows can be translated.
        with open(download_path, "w", encoding="utf8", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(astra_result.keys())
            writer.writerow(astra_result.values())

    def generate_download_response(
        self, result: dict, file_data: AstraDBBatchFileData
    ) -> DownloadResponse:
        """Persist ``result`` as a CSV file and build its download response.

        Args:
            result: A record returned by the collection; must contain ``_id``.
            file_data: The batch the record belongs to. Its ``source_identifiers``
                is overwritten with the new file name before it is cast.

        Raises:
            SourceConnectionNetworkError: If writing the CSV file fails.
        """
        record_id = result["_id"]
        filename_id = self.get_identifier(record_id=record_id)
        filename = f"{filename_id}.csv"  # csv to preserve column info
        download_path = self.download_dir / Path(filename)
        logger.debug(f"Downloading results from record {record_id} as csv to {download_path}")
        download_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.write_astra_result_to_csv(astra_result=result, download_path=str(download_path))
        except Exception as e:
            logger.error(
                f"failed to download from record {record_id} to {download_path}: {e}",
                exc_info=True,
            )
            raise SourceConnectionNetworkError(
                f"failed to download file {file_data.identifier}"
            ) from e

        # modify input file_data for download_response
        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
        cast_file_data = FileData.cast(file_data=file_data)
        cast_file_data.identifier = filename
        cast_file_data.metadata.date_processed = str(time())
        cast_file_data.metadata.record_locator = {"document_id": record_id}
        return super().generate_download_response(
            file_data=cast_file_data, download_path=download_path
        )

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Unsupported synchronous entry point; always raises ``NotImplementedError``."""
        raise NotImplementedError("Use astradb run_async instead")

    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Fetch every record in the batch and write each one to disk.

        Args:
            file_data: Batch file data; cast to ``AstraDBBatchFileData`` to read the
                record ids, collection name and keyspace.

        Returns:
            One download response per record returned by the ``$in`` query.
        """
        astra_file_data = AstraDBBatchFileData.cast(file_data=file_data)
        ids: list[str] = [item.identifier for item in astra_file_data.batch_items]
        collection_name: str = astra_file_data.additional_metadata.collection_name
        keyspace: Optional[str] = astra_file_data.additional_metadata.keyspace

        async_astra_collection = await get_async_astra_collection(
            connection_config=self.connection_config,
            collection_name=collection_name,
            keyspace=keyspace,
        )

        # Named ``responses`` so the imported ``download_responses`` alias is not shadowed.
        responses = []
        async for result in async_astra_collection.find({"_id": {"$in": ids}}):
            responses.append(
                self.generate_download_response(result=result, file_data=astra_file_data)
            )
        return responses
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
@dataclass
class AstraDBUploadStager(UploadStager):
    """Reshape element dictionaries into the document layout written to Astra DB."""

    upload_stager_config: AstraDBUploadStagerConfig = field(
        default_factory=lambda: AstraDBUploadStagerConfig()
    )

    def truncate_dict_elements(self, element_dict: dict) -> None:
        """Truncate ``text`` and ``metadata.text_as_html`` in place.

        Both values are cut with ``truncate_string_bytes`` to
        ``MAX_CONTENT_PARAM_BYTE_SIZE``. A key whose value is ``None`` is removed
        rather than re-inserted.
        """
        raw_text = element_dict.pop("text", None)
        if raw_text is not None:
            element_dict["text"] = truncate_string_bytes(raw_text, MAX_CONTENT_PARAM_BYTE_SIZE)

        meta = element_dict.get("metadata")
        if not isinstance(meta, dict):
            return

        html = meta.pop("text_as_html", None)
        if html is None:
            return
        meta["text_as_html"] = truncate_string_bytes(html, MAX_CONTENT_PARAM_BYTE_SIZE)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Convert one element dictionary into an Astra DB document.

        ``embeddings`` and ``text`` are popped out of ``element_dict`` (which is
        mutated) and everything left over is stored under ``metadata``.
        """
        self.truncate_dict_elements(element_dict)
        vector = element_dict.pop("embeddings", None)
        content = element_dict.pop("text", None)
        return {
            "$vector": vector,
            "content": content,
            RECORD_ID_LABEL: file_data.identifier,
            "metadata": element_dict,
        }
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
@dataclass
class AstraDBUploader(Uploader):
    """Write staged documents into an Astra DB collection."""

    connection_config: AstraDBConnectionConfig
    upload_config: AstraDBUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Validate the connection by requesting the collection's options.

        Raises:
            DestinationConnectionError: If opening the collection or the request fails.
        """
        try:
            self.get_collection().options()
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["astrapy"], extras="astradb")
    def get_collection(self) -> "AstraDBCollection":
        """Open the collection named in ``upload_config``."""
        return get_astra_collection(
            connection_config=self.connection_config,
            collection_name=self.upload_config.collection_name,
            keyspace=self.upload_config.keyspace,
        )

    def delete_by_record_id(self, collection: "AstraDBCollection", file_data: FileData) -> None:
        """Delete every document whose record-id field equals ``file_data.identifier``."""
        logger.debug(
            f"deleting records from collection {collection.name} "
            f"with {self.upload_config.record_id_key} "
            f"set to {file_data.identifier}"
        )
        delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
        delete_resp = collection.delete_many(filter=delete_filter)
        logger.debug(
            f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
        )

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Replace the documents for ``file_data`` with ``data``.

        Existing documents for the same record id are deleted first, then ``data``
        is inserted in chunks of ``upload_config.batch_size``.
        """
        logger.info(
            f"writing {len(data)} objects to destination "
            f"collection {self.upload_config.collection_name}"
        )

        collection = self.get_collection()
        self.delete_by_record_id(collection=collection, file_data=file_data)

        for chunk in batch_generator(data, self.upload_config.batch_size):
            collection.insert_many(chunk)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the staged data at ``path`` and upload it via ``run_data``."""
        data = get_data(path=path)
        self.run_data(data=data, file_data=file_data, **kwargs)
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
# Registry entry bundling the Astra DB source-side classes.
astra_db_source_entry = SourceRegistryEntry(
    connection_config=AstraDBConnectionConfig,
    indexer_config=AstraDBIndexerConfig,
    indexer=AstraDBIndexer,
    downloader_config=AstraDBDownloaderConfig,
    downloader=AstraDBDownloader,
)
|
|
395
|
+
|
|
396
|
+
# Registry entry bundling the Astra DB destination-side classes.
astra_db_destination_entry = DestinationRegistryEntry(
    connection_config=AstraDBConnectionConfig,
    upload_stager=AstraDBUploadStager,
    upload_stager_config=AstraDBUploadStagerConfig,
    uploader=AstraDBUploader,
    uploader_config=AstraDBUploaderConfig,
)
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Generator
|
|
5
|
+
|
|
6
|
+
from pydantic import Field, Secret
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.error import DestinationConnectionError, WriteError
|
|
9
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
10
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
|
+
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
12
|
+
from unstructured_ingest.v2.interfaces import (
|
|
13
|
+
AccessConfig,
|
|
14
|
+
ConnectionConfig,
|
|
15
|
+
FileData,
|
|
16
|
+
Uploader,
|
|
17
|
+
UploaderConfig,
|
|
18
|
+
UploadStager,
|
|
19
|
+
UploadStagerConfig,
|
|
20
|
+
)
|
|
21
|
+
from unstructured_ingest.v2.logger import logger
|
|
22
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
23
|
+
DestinationRegistryEntry,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
|
|
26
|
+
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from azure.search.documents import SearchClient
|
|
30
|
+
from azure.search.documents.indexes import SearchIndexClient
|
|
31
|
+
|
|
32
|
+
CONNECTOR_TYPE = "azure_ai_search"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class AzureAISearchAccessConfig(AccessConfig):
    """Sensitive values needed to authenticate against Azure AI Search."""

    # Exposed to callers under the alias ``key``.
    azure_ai_search_key: str = Field(
        alias="key",
        description="Credential that is used for authenticating to an Azure service",
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class AzureAISearchConnectionConfig(ConnectionConfig):
    """Endpoint, index and credentials for an Azure AI Search service."""

    endpoint: str = Field(
        description=(
            "The URL endpoint of an Azure AI (Cognitive) search service. "
            "In the form of https://{{service_name}}.search.windows.net"
        )
    )
    index: str = Field(
        description="The name of the Azure AI (Cognitive) Search index to connect to."
    )
    access_config: Secret[AzureAISearchAccessConfig]

    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
    @contextmanager
    def get_search_client(self) -> Generator["SearchClient", None, None]:
        """Yield a ``SearchClient`` bound to the configured index.

        The client is used as a context manager, so it is closed when the
        caller's ``with`` block exits.
        """
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents import SearchClient

        api_key = self.access_config.get_secret_value().azure_ai_search_key
        search_client = SearchClient(
            endpoint=self.endpoint,
            index_name=self.index,
            credential=AzureKeyCredential(api_key),
        )
        with search_client as client:
            yield client

    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
    @contextmanager
    def get_search_index_client(self) -> Generator["SearchIndexClient", None, None]:
        """Yield a ``SearchIndexClient`` for the configured endpoint.

        The client is used as a context manager, so it is closed when the
        caller's ``with`` block exits.
        """
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents.indexes import SearchIndexClient

        api_key = self.access_config.get_secret_value().azure_ai_search_key
        index_client = SearchIndexClient(
            endpoint=self.endpoint,
            credential=AzureKeyCredential(api_key),
        )
        with index_client as search_index_client:
            yield search_index_client
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class AzureAISearchUploadStagerConfig(UploadStagerConfig):
    """Stager settings for Azure AI Search; declares no options beyond the base config."""
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class AzureAISearchUploaderConfig(UploaderConfig):
    """Options controlling batching and record identification for uploads."""

    batch_size: int = Field(
        default=100,
        description="Number of records per batch",
    )
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@dataclass
class AzureAISearchUploadStager(UploadStager):
    """Reshape element dictionaries into documents for an Azure AI Search index."""

    upload_stager_config: AzureAISearchUploadStagerConfig = field(
        default_factory=lambda: AzureAISearchUploadStagerConfig()
    )

    @staticmethod
    def _format_datetime(value: Any) -> str:
        """Parse ``value`` with ``parse_datetime`` and render it as ``%Y-%m-%dT%H:%M:%S.%fZ``."""
        return parse_datetime(value).strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """
        updates the dictionary that is from each Element being converted into a dict/json
        into a dictionary that conforms to the schema expected by the
        Azure Cognitive Search index

        The input is not modified: the top-level dict, ``metadata``,
        ``metadata.coordinates`` and ``metadata.data_source`` are copied before any
        value is rewritten. Missing or ``None`` sub-dictionaries are skipped.

        Args:
            element_dict: A single element in dictionary form.
            file_data: Source file data; its identifier is stored under
                ``RECORD_ID_LABEL`` and used to derive the document ``id``.

        Returns:
            A new dictionary with ``id`` and the record id set, and with selected
            metadata values converted to strings (JSON text, formatted timestamps).
        """
        data = element_dict.copy()
        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
        data[RECORD_ID_LABEL] = file_data.identifier

        original_metadata = data.get("metadata")
        if not original_metadata:
            return data

        # Copy before editing so the caller's nested metadata stays untouched.
        metadata = dict(original_metadata)
        data["metadata"] = metadata

        coordinates = metadata.get("coordinates") or {}
        if points := coordinates.get("points"):
            metadata["coordinates"] = {**coordinates, "points": json.dumps(points)}

        if original_data_source := metadata.get("data_source"):
            data_source = dict(original_data_source)
            metadata["data_source"] = data_source
            if version := data_source.get("version"):
                data_source["version"] = str(version)
            for json_key in ("record_locator", "permissions_data"):
                if json_value := data_source.get(json_key):
                    data_source[json_key] = json.dumps(json_value)
            for date_key in ("date_created", "date_modified", "date_processed"):
                if date_value := data_source.get(date_key):
                    data_source[date_key] = self._format_datetime(date_value)

        if links := metadata.get("links"):
            metadata["links"] = [json.dumps(link) for link in links]
        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = self._format_datetime(last_modified)
        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)
        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)
        return data
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
@dataclass
class AzureAISearchUploader(Uploader):
    """Write staged documents into an Azure AI Search index."""

    upload_config: AzureAISearchUploaderConfig
    connection_config: AzureAISearchConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def query_docs(self, record_id: str, index_key: str) -> list[str]:
        """Return the key values of all documents stored for ``record_id``.

        Args:
            record_id: Value to match against the ``upload_config.record_id_key`` field.
            index_key: Name of the index's key field; only this field is selected.
        """
        # OData string literals escape an embedded single quote by doubling it.
        escaped_record_id = record_id.replace("'", "''")
        # Filter on the configured key, the same field ``can_delete`` validates.
        odata_filter = f"{self.upload_config.record_id_key} eq '{escaped_record_id}'"
        with self.connection_config.get_search_client() as search_client:
            results = list(search_client.search(filter=odata_filter, select=[index_key]))
        return [result[index_key] for result in results]

    def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
        """Delete every document previously written for ``file_data``.

        Raises:
            WriteError: If any individual delete is reported as failed.
        """
        logger.debug(
            f"deleting any content with metadata "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from azure cognitive search index: {self.connection_config.index}"
        )
        doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
        if not doc_ids_to_delete:
            return
        with self.connection_config.get_search_client() as search_client:
            results = search_client.delete_documents(
                documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
            )
            errors = [result for result in results if not result.succeeded]
            success_count = len(results) - len(errors)
            logger.debug(f"results: {success_count} successes, {len(errors)} failures")
            if errors:
                raise WriteError(
                    ", ".join(
                        [f"[{error.status_code}] {error.error_message}" for error in errors],
                    ),
                )

    @DestinationConnectionError.wrap
    @requires_dependencies(["azure"], extras="azure-ai-search")
    def write_dict(
        self, elements_dict: list[dict[str, Any]], search_client: "SearchClient"
    ) -> None:
        """Upload one batch of documents with the given client.

        Raises:
            WriteError: On an HTTP error from the service, or if any document in
                the batch is reported as failed.
        """
        import azure.core.exceptions

        logger.info(
            f"writing {len(elements_dict)} documents to destination "
            f"index at {self.connection_config.index}",
        )
        try:
            results = search_client.upload_documents(documents=elements_dict)
        except azure.core.exceptions.HttpResponseError as http_error:
            raise WriteError(f"http error: {http_error}") from http_error

        errors = [result for result in results if not result.succeeded]
        success_count = len(results) - len(errors)
        logger.debug(f"results: {success_count} successes, {len(errors)} failures")
        if errors:
            raise WriteError(
                ", ".join(
                    [
                        f"{error.key}: [{error.status_code}] {error.error_message}"
                        for error in errors
                    ],
                ),
            )

    def can_delete(self) -> bool:
        """Report whether the index has a filterable field named ``record_id_key``."""
        with self.connection_config.get_search_index_client() as search_index_client:
            index = search_index_client.get_index(name=self.connection_config.index)
            record_id_fields = [
                index_field
                for index_field in index.fields
                if index_field.name == self.upload_config.record_id_key
            ]
            if not record_id_fields:
                return False
            # Coerced so an unset ``filterable`` attribute yields False, not None.
            return bool(record_id_fields[0].filterable)

    def get_index_key(self) -> str:
        """Return the name of the first field flagged as the index key.

        Raises:
            ValueError: If no field in the index is flagged as the key.
        """
        with self.connection_config.get_search_index_client() as search_index_client:
            index = search_index_client.get_index(name=self.connection_config.index)
            key_fields = [index_field for index_field in index.fields if index_field.key]
            if not key_fields:
                raise ValueError("no key field found in index fields")
            return key_fields[0].name

    def precheck(self) -> None:
        """Validate the connection by requesting the index's document count.

        Raises:
            DestinationConnectionError: If the client cannot be created or the
                request fails.
        """
        try:
            with self.connection_config.get_search_client() as search_client:
                search_client.get_document_count()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Replace the documents for ``file_data`` with ``data``.

        When the index supports filtering on the record id, documents from earlier
        runs are deleted first; otherwise a warning is logged and deletion is
        skipped. ``data`` is then uploaded in chunks of ``upload_config.batch_size``.
        """
        logger.info(
            f"writing document batches to destination"
            f" endpoint at {self.connection_config.endpoint}"
            f" index at {self.connection_config.index}"
            f" with batch size {self.upload_config.batch_size}"
        )
        if self.can_delete():
            index_key = self.get_index_key()
            self.delete_by_record_id(file_data=file_data, index_key=index_key)
        else:
            logger.warning("criteria for deleting previous content not met, skipping")

        batch_size = self.upload_config.batch_size
        with self.connection_config.get_search_client() as search_client:
            for chunk in batch_generator(data, batch_size):
                self.write_dict(elements_dict=chunk, search_client=search_client)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
# Registry entry bundling the Azure AI Search destination-side classes.
azure_ai_search_destination_entry = DestinationRegistryEntry(
    connection_config=AzureAISearchConnectionConfig,
    upload_stager_config=AzureAISearchUploadStagerConfig,
    upload_stager=AzureAISearchUploadStager,
    uploader_config=AzureAISearchUploaderConfig,
    uploader=AzureAISearchUploader,
)
|