unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from datetime import date, datetime
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
7
|
+
|
|
8
|
+
from dateutil import parser
|
|
9
|
+
from pydantic import Field, Secret
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.error import DestinationConnectionError, WriteError
|
|
12
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
+
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
14
|
+
from unstructured_ingest.v2.interfaces import (
|
|
15
|
+
AccessConfig,
|
|
16
|
+
ConnectionConfig,
|
|
17
|
+
FileData,
|
|
18
|
+
Uploader,
|
|
19
|
+
UploaderConfig,
|
|
20
|
+
UploadStager,
|
|
21
|
+
UploadStagerConfig,
|
|
22
|
+
)
|
|
23
|
+
from unstructured_ingest.v2.logger import logger
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from weaviate.classes.init import Timeout
|
|
27
|
+
from weaviate.client import WeaviateClient
|
|
28
|
+
from weaviate.collections.batch.client import BatchClient
|
|
29
|
+
|
|
30
|
+
# Connector type label for the Weaviate connector code defined in this module.
CONNECTOR_TYPE = "weaviate"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class WeaviateAccessConfig(AccessConfig, ABC):
    # Base access config for the Weaviate connectors; it declares no fields of its own.
    ...
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class WeaviateConnectionConfig(ConnectionConfig, ABC):
    # Connection settings shared by the Weaviate connectors: three timeouts plus the
    # (secret-wrapped) access config. Subclasses must implement get_client().
    init_timeout: int = Field(default=2, ge=0, description="Timeout for initialization checks")
    insert_timeout: int = Field(default=90, ge=0, description="Timeout for insert operations")
    query_timeout: int = Field(default=30, ge=0, description="Timeout for query operations")
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(), validate_default=True
    )

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_timeout(self) -> "Timeout":
        """Return a weaviate ``Timeout`` built from the configured timeout fields."""
        from weaviate.classes.init import Timeout

        timeout_kwargs = {
            "init": self.init_timeout,
            "query": self.query_timeout,
            "insert": self.insert_timeout,
        }
        return Timeout(**timeout_kwargs)

    @abstractmethod
    @contextmanager
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Context manager yielding a Weaviate client; implemented by subclasses."""
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class WeaviateUploadStagerConfig(UploadStagerConfig):
    # No Weaviate-specific staging options are declared here.
    ...
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class WeaviateUploadStager(UploadStager):
    """Stager that rewrites element dictionaries so they conform to the Weaviate schema."""

    upload_stager_config: WeaviateUploadStagerConfig = field(
        default_factory=lambda: WeaviateUploadStagerConfig()
    )

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Parse a date given either as a numeric timestamp or as a date string.

        The value is first interpreted as a float timestamp; if that fails it is
        handed to ``dateutil.parser.parse``.

        Args:
            date_string: Timestamp-like or date-like value to parse.

        Returns:
            The parsed ``datetime`` (a ``date`` subclass).
        """
        try:
            timestamp = float(date_string)
            return datetime.fromtimestamp(timestamp)
        except (TypeError, ValueError, OverflowError, OSError) as e:
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    def _format_date(self, value: str) -> str:
        """Parse ``value`` and render it in the timestamp format written to Weaviate."""
        # NOTE(review): datetime.fromtimestamp() yields naive local time while this
        # format appends a literal "Z" -- confirm the values are meant to be UTC.
        return self.parse_date_string(value).strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Return a copy of ``element_dict`` updated to conform to the Weaviate schema.

        Nested structures are serialized to JSON strings, date fields are
        reformatted, ``version`` and ``page_number`` are cast to strings, and the
        record id of ``file_data`` is stored under ``RECORD_ID_LABEL``.

        Args:
            element_dict: The element to convert. It is not modified.
            file_data: File data whose ``identifier`` is attached to the result.

        Returns:
            The converted element dictionary.
        """
        import copy

        # A deep copy is required: the nested metadata dicts are rewritten below, and
        # a shallow copy would leak those edits into the caller's element.
        working_data = copy.deepcopy(element_dict)
        metadata = working_data.get("metadata", {})
        data_source = metadata.get("data_source", {})

        # Dict as string formatting
        if record_locator := data_source.get("record_locator"):
            # Explicit casting otherwise fails schema type checking
            data_source["record_locator"] = json.dumps(record_locator)

        # Array of items as string formatting
        if points := metadata.get("coordinates", {}).get("points"):
            metadata["coordinates"]["points"] = json.dumps(points)

        if links := metadata.get("links", {}):
            metadata["links"] = json.dumps(links)

        if permissions_data := data_source.get("permissions_data"):
            data_source["permissions_data"] = json.dumps(permissions_data)

        # Datetime formatting
        for date_key in ("date_created", "date_modified", "date_processed"):
            if date_value := data_source.get(date_key):
                data_source[date_key] = self._format_date(date_value)

        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = self._format_date(last_modified)

        # String casting
        if version := data_source.get("version"):
            data_source["version"] = str(version)

        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)

        working_data[RECORD_ID_LABEL] = file_data.identifier
        return working_data
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class WeaviateUploaderConfig(UploaderConfig):
    # Upload settings for Weaviate. Exactly one batch mode must be selected:
    # dynamic (dynamic_batch), fixed size (batch_size) or rate limited
    # (requests_per_minute).
    collection: str = Field(description="The name of the collection this object belongs to")
    batch_size: Optional[int] = Field(default=None, description="Number of records per batch")
    requests_per_minute: Optional[int] = Field(default=None, description="Rate limit for upload")
    dynamic_batch: bool = Field(default=True, description="Whether to use dynamic batch")
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )

    def model_post_init(self, __context: Any) -> None:
        """Validate that exactly one usable batch mode is configured.

        Raises:
            ValueError: If ``batch_size`` or ``requests_per_minute`` is set to a
                non-positive value, if no batch mode is enabled, or if more than
                one is enabled.
        """
        # Reject values that would be counted as an enabled mode here but could not
        # be used by get_batch_client(), so the failure surfaces at config time.
        for option_name in ("batch_size", "requests_per_minute"):
            option_value = getattr(self, option_name)
            if option_value is not None and option_value <= 0:
                raise ValueError(f"{option_name} must be a positive integer, got {option_value}")

        batch_types = {
            "fixed_size": self.batch_size is not None,
            "rate_limited": self.requests_per_minute is not None,
            "dynamic": self.dynamic_batch,
        }

        enabled_batch_modes = [batch_key for batch_key, flag in batch_types.items() if flag]
        if not enabled_batch_modes:
            raise ValueError("No batch mode enabled")
        if len(enabled_batch_modes) > 1:
            raise ValueError(
                "Multiple batch modes enabled, only one mode can be used: "
                f"{', '.join(enabled_batch_modes)}"
            )
        logger.info(f"Uploader config instantiated with {enabled_batch_modes[0]} batch mode")

    @contextmanager
    def get_batch_client(self, client: "WeaviateClient") -> Generator["BatchClient", None, None]:
        """Yield the batch client of ``client`` for the configured batch mode.

        Args:
            client: Weaviate client whose ``batch`` interface is used.

        Raises:
            ValueError: If no batch mode is enabled.
        """
        # The checks mirror the ones in model_post_init (``is not None``).
        if self.dynamic_batch:
            with client.batch.dynamic() as batch_client:
                yield batch_client
        elif self.batch_size is not None:
            with client.batch.fixed_size(batch_size=self.batch_size) as batch_client:
                yield batch_client
        elif self.requests_per_minute is not None:
            with client.batch.rate_limit(
                requests_per_minute=self.requests_per_minute
            ) as batch_client:
                yield batch_client
        else:
            raise ValueError("No batch mode enabled")
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
@dataclass
class WeaviateUploader(Uploader, ABC):
    """Uploader that writes element dictionaries into a Weaviate collection."""

    upload_config: WeaviateUploaderConfig
    connection_config: WeaviateConnectionConfig

    def precheck(self) -> None:
        """Check that a client can be obtained from the connection config.

        Raises:
            DestinationConnectionError: If obtaining the client fails.
        """
        try:
            # get_client() is a context manager: nothing runs until it is entered,
            # so it has to be used in a ``with`` for the check to do anything.
            with self.connection_config.get_client():
                pass
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def check_for_errors(self, client: "WeaviateClient") -> None:
        """Log every failed batch object of ``client`` and raise if there is any.

        Raises:
            WriteError: If ``client.batch.failed_objects`` is non-empty.
        """
        failed_uploads = client.batch.failed_objects
        if failed_uploads:
            for failure in failed_uploads:
                logger.error(
                    f"Failed to upload object with id {failure.original_uuid}: {failure.message}"
                )
            raise WriteError("Failed to upload to weaviate")

    @requires_dependencies(["weaviate"], extras="weaviate")
    def delete_by_record_id(self, client: "WeaviateClient", file_data: FileData) -> None:
        """Delete all objects in the collection whose record id matches ``file_data``.

        Args:
            client: Weaviate client used to access the configured collection.
            file_data: File data whose ``identifier`` is matched against the
                ``record_id_key`` property.

        Raises:
            WriteError: If a delete request reports failures.
        """
        from weaviate.classes.query import Filter

        record_id = file_data.identifier
        collection = client.collections.get(self.upload_config.collection)
        delete_filter = Filter.by_property(name=self.upload_config.record_id_key).equal(
            val=record_id
        )
        # There is a configurable maximum limit (QUERY_MAXIMUM_RESULTS) on the number of
        # objects that can be deleted in a single query (default 10,000). To delete
        # more objects than the limit, re-run the query until nothing is deleted.
        while True:
            resp = collection.data.delete_many(where=delete_filter)
            if resp.failed:
                raise WriteError(
                    f"failed to delete records in collection "
                    f"{self.upload_config.collection} with record "
                    f"id property {record_id}"
                )
            if not resp.successful:
                break

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Replace the objects of ``file_data`` in the collection with ``data``.

        Existing objects for the same record id are deleted first, then each element
        is added through the configured batch client, with its ``embeddings`` entry
        (if any) passed as the vector instead of as a property.

        Args:
            data: Element dictionaries to upload. They are not modified.
            file_data: File data identifying the record being written.

        Raises:
            WriteError: If deleting previous objects or uploading new ones fails.
        """
        logger.info(
            f"writing {len(data)} objects to destination "
            f"class {self.upload_config.collection} "
        )

        with self.connection_config.get_client() as weaviate_client:
            self.delete_by_record_id(client=weaviate_client, file_data=file_data)
            with self.upload_config.get_batch_client(client=weaviate_client) as batch_client:
                for element in data:
                    vector = element.get("embeddings")
                    properties = {
                        key: value for key, value in element.items() if key != "embeddings"
                    }
                    batch_client.add_object(
                        collection=self.upload_config.collection,
                        properties=properties,
                        vector=vector,
                    )
            # Failures are inspected only after the batch context has exited.
            self.check_for_errors(client=weaviate_client)
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from abc import ABC
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal, Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field, SecretStr
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class EmbedderConfig(BaseModel):
    # Settings used to pick an embedding provider and build its encoder.
    embedding_provider: Optional[
        Literal[
            "openai",
            "azure-openai",
            "huggingface",
            "aws-bedrock",
            "vertexai",
            "voyageai",
            "octoai",
            "mixedbread-ai",
            "togetherai",
        ]
    ] = Field(default=None, description="Type of the embedding class to be used.")
    embedding_api_key: Optional[SecretStr] = Field(
        default=None,
        description="API key for the embedding model, for the case an API key is needed.",
    )
    embedding_model_name: Optional[str] = Field(
        default=None,
        description="Embedding model name, if needed. "
        "Chooses a particular LLM between different options, to embed with it.",
    )
    embedding_aws_access_key_id: Optional[str] = Field(
        default=None, description="AWS access key used for AWS-based embedders, such as bedrock"
    )
    embedding_aws_secret_access_key: Optional[SecretStr] = Field(
        default=None, description="AWS secret key used for AWS-based embedders, such as bedrock"
    )
    embedding_aws_region: Optional[str] = Field(
        default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
    )
    embedding_azure_endpoint: Optional[str] = Field(
        default=None,
        description="Your Azure endpoint, including the resource, "
        "e.g. `https://example-resource.azure.openai.com/`",
    )
    embedding_azure_api_version: Optional[str] = Field(
        description="Azure API version", default=None
    )

    def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the HuggingFace encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.huggingface import (
            HuggingFaceEmbeddingConfig,
            HuggingFaceEmbeddingEncoder,
        )

        return HuggingFaceEmbeddingEncoder(
            config=HuggingFaceEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the OpenAI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

        return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig.model_validate(embedding_kwargs))

    def get_azure_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the Azure OpenAI encoder from the fields of this config.

        ``embedding_kwargs`` is accepted for signature parity with the other
        factories but is not read; the API key, endpoint, API version and model
        name are taken from this config directly.
        """
        from unstructured_ingest.embed.azure_openai import (
            AzureOpenAIEmbeddingConfig,
            AzureOpenAIEmbeddingEncoder,
        )

        config_kwargs = {
            "api_key": self.embedding_api_key,
            "azure_endpoint": self.embedding_azure_endpoint,
        }
        if api_version := self.embedding_azure_api_version:
            config_kwargs["api_version"] = api_version
        if model_name := self.embedding_model_name:
            config_kwargs["model_name"] = model_name

        return AzureOpenAIEmbeddingEncoder(
            config=AzureOpenAIEmbeddingConfig.model_validate(config_kwargs)
        )

    def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the OctoAI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder

        return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig.model_validate(embedding_kwargs))

    def get_bedrock_embedder(self) -> "BaseEmbeddingEncoder":
        """Build the AWS Bedrock encoder from the AWS fields of this config.

        Raises:
            ValueError: If ``embedding_aws_secret_access_key`` is not set.
        """
        from unstructured_ingest.embed.bedrock import (
            BedrockEmbeddingConfig,
            BedrockEmbeddingEncoder,
        )

        # The secret key is optional on the model (default None); fail with a clear
        # message instead of an AttributeError on ``None.get_secret_value()``.
        secret_access_key = self.embedding_aws_secret_access_key
        if secret_access_key is None:
            raise ValueError(
                "embedding_aws_secret_access_key is required for the aws-bedrock embedder"
            )

        return BedrockEmbeddingEncoder(
            config=BedrockEmbeddingConfig(
                aws_access_key_id=self.embedding_aws_access_key_id,
                aws_secret_access_key=secret_access_key.get_secret_value(),
                region_name=self.embedding_aws_region,
            )
        )

    def get_vertexai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the VertexAI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.vertexai import (
            VertexAIEmbeddingConfig,
            VertexAIEmbeddingEncoder,
        )

        return VertexAIEmbeddingEncoder(
            config=VertexAIEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_voyageai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the VoyageAI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.voyageai import (
            VoyageAIEmbeddingConfig,
            VoyageAIEmbeddingEncoder,
        )

        return VoyageAIEmbeddingEncoder(
            config=VoyageAIEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_mixedbread_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the Mixedbread AI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.mixedbreadai import (
            MixedbreadAIEmbeddingConfig,
            MixedbreadAIEmbeddingEncoder,
        )

        return MixedbreadAIEmbeddingEncoder(
            config=MixedbreadAIEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_togetherai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the Together AI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.togetherai import (
            TogetherAIEmbeddingConfig,
            TogetherAIEmbeddingEncoder,
        )

        return TogetherAIEmbeddingEncoder(
            config=TogetherAIEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_embedder(self) -> "BaseEmbeddingEncoder":
        """Return the encoder for ``embedding_provider``.

        The API key (unwrapped) and model name are forwarded, when set, to every
        provider factory except Bedrock, which takes no keyword arguments.

        Raises:
            ValueError: If ``embedding_provider`` is not one of the known providers.
        """
        kwargs: dict[str, Any] = {}
        if self.embedding_api_key:
            kwargs["api_key"] = self.embedding_api_key.get_secret_value()
        if self.embedding_model_name:
            kwargs["model_name"] = self.embedding_model_name

        if self.embedding_provider == "aws-bedrock":
            return self.get_bedrock_embedder()

        # TODO make this more dynamic to map to encoder configs
        factories = {
            "openai": self.get_openai_embedder,
            "huggingface": self.get_huggingface_embedder,
            "octoai": self.get_octoai_embedder,
            "vertexai": self.get_vertexai_embedder,
            "voyageai": self.get_voyageai_embedder,
            "mixedbread-ai": self.get_mixedbread_embedder,
            "togetherai": self.get_togetherai_embedder,
            "azure-openai": self.get_azure_openai_embedder,
        }
        factory = factories.get(self.embedding_provider)
        if factory is None:
            raise ValueError(f"{self.embedding_provider} not a recognized encoder")
        return factory(embedding_kwargs=kwargs)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@dataclass
class Embedder(BaseProcess, ABC):
    """Process step that loads elements from a JSON file and embeds them."""

    config: EmbedderConfig

    def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
        """Embed the elements stored in ``elements_filepath``.

        Args:
            elements_filepath: Path to a JSON file containing the elements.
            **kwargs: Accepted but not used here.

        Returns:
            The result of the encoder's ``embed_documents`` call, or an empty list
            when the file holds no elements.
        """
        # TODO update base embedder classes to support async
        embedder = self.config.get_embedder()
        with elements_filepath.open("r") as elements_file:
            elements = json.load(elements_file)
        if not elements:
            # Nothing to embed. The loaded values are plain JSON data, so there is
            # nothing to convert either; an empty/null payload maps to no elements.
            return []
        return embedder.embed_documents(elements=elements)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import fnmatch
|
|
2
|
+
from abc import ABC
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any, Callable, Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.v2.interfaces import FileData
|
|
9
|
+
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
10
|
+
from unstructured_ingest.v2.logger import logger
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class FiltererConfig(BaseModel):
    # Options that decide which files the Filterer lets through.
    file_glob: Optional[list[str]] = Field(
        default=None,
        description="file globs to limit which types of files are accepted",
        examples=["*.pdf", "*.html"],
    )
    max_file_size: Optional[int] = Field(
        description="Max file size to process in bytes",
        default=None,
    )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class Filterer(BaseProcess, ABC):
    """Process step that drops file data rejected by the configured filters."""

    config: FiltererConfig = field(default_factory=lambda: FiltererConfig())
    filters: list[Callable[[FileData], bool]] = field(init=False, default_factory=list)

    def __post_init__(self):
        """Register one filter per option that is set in the config."""
        settings = self.config
        if settings.file_glob is not None:
            self.filters.append(self.glob_filter)
        if settings.max_file_size:
            self.filters.append(self.file_size_filter)

    def is_async(self) -> bool:
        """Report that this process is not async."""
        return False

    def file_size_filter(self, file_data: FileData) -> bool:
        """Accept the file unless its known size exceeds ``max_file_size``."""
        known_size = file_data.metadata.filesize_bytes
        if not known_size:
            # No (or zero) size recorded: the file is let through.
            return True
        return known_size <= self.config.max_file_size

    def glob_filter(self, file_data: FileData) -> bool:
        """Accept the file when its full path matches at least one configured glob."""
        path = file_data.source_identifiers.fullpath
        if any(fnmatch.filter([path], glob) for glob in self.config.file_glob):
            return True
        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
        return False

    def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
        """Return ``file_data`` if every filter accepts it, otherwise ``None``."""
        # Filters run in registration order and evaluation stops at the first rejection.
        rejected_by = next((check for check in self.filters if not check(file_data)), None)
        if rejected_by is None:
            return file_data
        logger.debug(
            f"filtered out file data due to {rejected_by.__name__}: {file_data.identifier}"
        )
        return None
|