unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from contextlib import asynccontextmanager, contextmanager
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import Field, Secret, model_validator
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
9
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
10
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
|
+
from unstructured_ingest.v2.interfaces import (
|
|
12
|
+
AccessConfig,
|
|
13
|
+
ConnectionConfig,
|
|
14
|
+
FileData,
|
|
15
|
+
Uploader,
|
|
16
|
+
UploaderConfig,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.v2.logger import logger
|
|
19
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from redis.asyncio import Redis
|
|
23
|
+
|
|
24
|
+
import asyncio
|
|
25
|
+
|
|
26
|
+
# Default `connector_type` value for the connection config and uploader in this module.
CONNECTOR_TYPE = "redis"
# NOTE(review): not referenced anywhere else in this module — confirm whether it is still needed.
SERVER_API_VERSION = "1"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class RedisAccessConfig(AccessConfig):
    # Sensitive Redis settings; RedisConnectionConfig stores this model wrapped
    # in a pydantic Secret. Both fields are optional because a connection can
    # instead be described by the host/port fields on RedisConnectionConfig.
    uri: Optional[str] = Field(
        default=None, description="If not anonymous, use this uri, if specified."
    )
    password: Optional[str] = Field(
        default=None, description="If not anonymous, use this password, if specified."
    )
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class RedisConnectionConfig(ConnectionConfig):
    # Connection settings for a Redis destination.
    #
    # A connection is described either by `access_config.uri` or by the discrete
    # host/port/database/username/ssl fields. When a uri is present it is the
    # only thing handed to the client; the discrete fields and
    # `access_config.password` are not used in that case.
    access_config: Secret[RedisAccessConfig] = Field(
        default=RedisAccessConfig(), validate_default=True
    )
    host: Optional[str] = Field(
        default=None, description="Hostname or IP address of a Redis instance to connect to."
    )
    database: int = Field(default=0, description="Database index to connect to.")
    port: int = Field(default=6379, description="port used to connect to database.")
    username: Optional[str] = Field(
        default=None, description="Username used to connect to database."
    )
    ssl: bool = Field(default=True, description="Whether the connection should use SSL encryption.")
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @model_validator(mode="after")
    def validate_host_or_url(self) -> "RedisConnectionConfig":
        """Reject configurations that provide neither a uri nor a host.

        Raises:
            ValueError: if both `access_config.uri` and `host` are empty.
        """
        if not self.access_config.get_secret_value().uri and not self.host:
            raise ValueError("Please pass a hostname either directly or through uri")
        return self

    def _client_options(self) -> dict[str, Any]:
        """Build the keyword arguments shared by the sync and async Redis clients.

        The password is only included when one is configured, so that an
        anonymous connection does not pass `password=None` explicitly.
        """
        options: dict[str, Any] = {
            "host": self.host,
            "port": self.port,
            "db": self.database,
            "ssl": self.ssl,
            "username": self.username,
        }
        password = self.access_config.get_secret_value().password
        if password:
            options["password"] = password
        return options

    @requires_dependencies(["redis"], extras="redis")
    @asynccontextmanager
    async def create_async_client(self) -> AsyncGenerator["Redis", None]:
        """Yield an asyncio Redis client that is closed when the context exits.

        The client is built from `access_config.uri` when set, otherwise from
        the discrete connection fields.
        """
        from redis.asyncio import Redis, from_url

        uri = self.access_config.get_secret_value().uri
        client_cm = from_url(uri) if uri else Redis(**self._client_options())
        async with client_cm as client:
            yield client

    @requires_dependencies(["redis"], extras="redis")
    @contextmanager
    def create_client(self) -> Generator["Redis", None, None]:
        """Yield a synchronous Redis client that is closed when the context exits.

        The client is built from `access_config.uri` when set, otherwise from
        the discrete connection fields.
        """
        # NOTE(review): the "Redis" name in the annotation resolves to the
        # TYPE_CHECKING import of redis.asyncio.Redis, while this method yields
        # the synchronous redis.Redis — confirm whether the hint should differ.
        from redis import Redis, from_url

        uri = self.access_config.get_secret_value().uri
        client_cm = from_url(uri) if uri else Redis(**self._client_options())
        with client_cm as client:
            yield client
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class RedisUploaderConfig(UploaderConfig):
    # Upper bound on how many elements RedisUploader sends in one pipeline.
    batch_size: int = Field(default=100, description="Number of records per batch")
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@dataclass
class RedisUploader(Uploader):
    """Writes elements to Redis, keyed by each element's ``element_id``.

    Elements are stored with ``JSON.SET`` when the server supports it and as
    JSON-encoded strings otherwise.
    """

    upload_config: RedisUploaderConfig
    connection_config: RedisConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Report that this uploader is driven through ``run_data_async``."""
        return True

    def precheck(self) -> None:
        """Verify the connection by pinging the server with a synchronous client.

        Raises:
            DestinationConnectionError: if the client cannot be created or the
                ping fails; the original exception is attached as the cause.
        """
        try:
            with self.connection_config.create_client() as client:
                client.ping()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Write all elements in ``data`` to Redis in concurrent batches.

        Args:
            data: elements to write; each must carry an ``element_id`` key.
            file_data: metadata of the originating file (not used here).
            **kwargs: accepted for interface compatibility and ignored.
        """
        if not data:
            # Nothing to probe the server with and nothing to write; the
            # original code raised IndexError on data[0] in this case.
            logger.info("no objects to write to destination, skipping")
            return

        # The first element doubles as a probe for JSON support on the server.
        redis_stack = await self._check_redis_stack(data[0])
        logger.info(
            f"writing {len(data)} objects to destination asynchronously, "
            f"db, {self.connection_config.database}, "
            f"at {self.connection_config.host}",
        )

        batches = list(batch_generator(data, batch_size=self.upload_config.batch_size))
        await asyncio.gather(*[self._write_batch(batch, redis_stack) for batch in batches])

    async def _write_batch(self, batch: list[dict], redis_stack: bool) -> None:
        """Write one batch through a single transactional pipeline.

        Args:
            batch: elements to write; each must carry an ``element_id`` key.
            redis_stack: when True store with ``JSON.SET``, otherwise store the
                JSON-encoded element as a plain string value.
        """
        async with self.connection_config.create_async_client() as async_client:
            async with async_client.pipeline(transaction=True) as pipe:
                for element in batch:
                    element_id = element["element_id"]
                    if redis_stack:
                        pipe.json().set(element_id, "$", element)
                    else:
                        pipe.set(element_id, json.dumps(element))
                await pipe.execute()

    @requires_dependencies(["redis"], extras="redis")
    async def _check_redis_stack(self, element: dict) -> bool:
        """Detect JSON support by attempting to store ``element`` with ``JSON.SET``.

        If the server rejects ``JSON.SET`` as an unknown command, the element is
        stored as a JSON-encoded string instead.

        Returns:
            True if ``JSON.SET`` succeeded, False if the string fallback was used.

        Raises:
            redis.exceptions.ResponseError: for any other server-side error.
        """
        from redis import exceptions as redis_exceptions

        redis_stack = True
        async with self.connection_config.create_async_client() as async_client:
            async with async_client.pipeline(transaction=True) as pipe:
                element_id = element["element_id"]
                try:
                    # Redis with stack extension supports JSON type
                    await pipe.json().set(element_id, "$", element).execute()
                except redis_exceptions.ResponseError as e:
                    message = str(e)
                    # The quoting around the command name in this error is not
                    # the same on every server (backticks vs. single quotes),
                    # so match on the stable parts of the message only.
                    if "unknown command" in message and "JSON.SET" in message:
                        # if this error occurs, Redis server doesn't support JSON type,
                        # so save as string type instead
                        await pipe.set(element_id, json.dumps(element)).execute()
                        redis_stack = False
                    else:
                        raise
        return redis_stack
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# Registry entry bundling the Redis connection config, uploader, and uploader
# config classes defined in this module.
redis_destination_entry = DestinationRegistryEntry(
    connection_config=RedisConnectionConfig,
    uploader=RedisUploader,
    uploader_config=RedisUploaderConfig,
)
|
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Salesforce Connector
|
|
3
|
+
Able to download Account, Case, Campaign, EmailMessage, Lead
|
|
4
|
+
Salesforce returns everything as a list of json.
|
|
5
|
+
This saves each entry as a separate file to be partitioned.
|
|
6
|
+
Using JWT authorization
|
|
7
|
+
https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_key_and_cert.htm
|
|
8
|
+
https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_connected_app.htm
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from collections import OrderedDict
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from email.utils import formatdate
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from string import Template
|
|
17
|
+
from textwrap import dedent
|
|
18
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional, Type
|
|
19
|
+
|
|
20
|
+
from dateutil import parser
|
|
21
|
+
from pydantic import Field, Secret
|
|
22
|
+
|
|
23
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
24
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
25
|
+
from unstructured_ingest.v2.interfaces import (
|
|
26
|
+
AccessConfig,
|
|
27
|
+
ConnectionConfig,
|
|
28
|
+
Downloader,
|
|
29
|
+
DownloaderConfig,
|
|
30
|
+
DownloadResponse,
|
|
31
|
+
FileData,
|
|
32
|
+
FileDataSourceMetadata,
|
|
33
|
+
Indexer,
|
|
34
|
+
IndexerConfig,
|
|
35
|
+
SourceIdentifiers,
|
|
36
|
+
)
|
|
37
|
+
from unstructured_ingest.v2.logger import logger
|
|
38
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
39
|
+
SourceRegistryEntry,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class MissingCategoryError(Exception):
    """There are no categories with that name.

    Raised by ``SalesforceIndexer.get_file_extension`` when it is given a
    record type it has no file extension for.
    """
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# `connector_type` value used for this module's FileData records and downloader.
CONNECTOR_TYPE = "salesforce"

if TYPE_CHECKING:
    from simple_salesforce import Salesforce

# Passed as `version=` when the Salesforce client is constructed.
SALESFORCE_API_VERSION = "57.0"

# TODO: Add more categories as needed
# Record types the indexer accepts; SalesforceIndexer.__post_init__ rejects anything else.
ACCEPTED_CATEGORIES: list[str] = ["Account", "Case", "Campaign", "EmailMessage", "Lead"]
|
|
56
|
+
|
|
57
|
+
# Generic minimal email template used only
# to process EmailMessage records as .eml files.
# The $-placeholders are filled in by SalesforceDownloader._eml_for_record.
# NOTE(review): the template has no blank line between the headers and the
# first boundary, nor between each part's Content-Type line and its body —
# confirm that downstream .eml parsing tolerates this.
EMAIL_TEMPLATE = Template(
    """MIME-Version: 1.0
Date: $date
Message-ID: $message_identifier
Subject: $subject
From: $from_email
To: $to_email
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
--00000000000095c9b205eff92630
Content-Type: text/plain; charset="UTF-8"
$textbody
--00000000000095c9b205eff92630
Content-Type: text/html; charset="UTF-8"
$htmlbody
--00000000000095c9b205eff92630--
""",
)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class SalesforceAccessConfig(AccessConfig):
    # Credentials for Salesforce JWT authorization. Exactly one of
    # `private_key_path` / `private_key` must be supplied; this is enforced in
    # `model_post_init`.
    consumer_key: str
    private_key_path: Optional[Path] = Field(
        default=None,
        description="Path to the private key file. Key file is usually named server.key.",
    )
    private_key: Optional[str] = Field(default=None, description="Contents of the private key")

    def model_post_init(self, __context: Any) -> None:
        """Check that exactly one private key source was provided.

        Raises:
            ValueError: if neither or both of `private_key` and
                `private_key_path` are set.
        """
        has_path = self.private_key_path is not None
        has_inline_key = self.private_key is not None
        if not (has_path or has_inline_key):
            raise ValueError("either private_key or private_key_path must be set")
        if has_path and has_inline_key:
            raise ValueError("only one of private_key or private_key_path must be set")

    @requires_dependencies(["cryptography"])
    def get_private_key_value_and_type(self) -> tuple[str, Type]:
        """Return the configured key together with a marker of its form.

        Returns:
            `(str(path), Path)` when `private_key_path` points at an existing
            file, or `(private_key, str)` when the inline key parses as an
            unencrypted PEM private key.

        Raises:
            ValueError: if the inline key cannot be loaded as PEM, or if no
                usable key source is available.
        """
        from cryptography.hazmat.primitives import serialization

        key_path = self.private_key_path
        if key_path and key_path.is_file():
            return str(key_path), Path

        if not self.private_key:
            raise ValueError("private_key does not contain PEM private key or path")

        # Parse the key purely as validation; the parsed object is discarded.
        pem_bytes = str(self.private_key).encode("utf-8")
        try:
            serialization.load_pem_private_key(data=pem_bytes, password=None)
        except Exception as e:
            raise ValueError(f"failed to validate private key data: {e}") from e
        return self.private_key, str
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class SalesforceConnectionConfig(ConnectionConfig):
    # Salesforce login name plus the secret-wrapped JWT credentials used to
    # build a simple_salesforce client.
    username: str
    access_config: Secret[SalesforceAccessConfig]

    @requires_dependencies(["simple_salesforce"], extras="salesforce")
    def get_client(self) -> "Salesforce":
        """Construct a Salesforce client from the configured credentials.

        The private key is handed over either as a file path or as inline
        text, depending on which form the access config reports.
        """
        from simple_salesforce import Salesforce

        secrets = self.access_config.get_secret_value()
        key_value, key_kind = secrets.get_private_key_value_and_type()

        # Only one of the two keyword arguments carries the key; the other stays None.
        key_file = None
        key_text = None
        if key_kind is Path:
            key_file = key_value
        if key_kind is str:
            key_text = key_value

        return Salesforce(
            username=self.username,
            consumer_key=secrets.consumer_key,
            privatekey_file=key_file,
            privatekey=key_text,
            version=SALESFORCE_API_VERSION,
        )
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class SalesforceIndexerConfig(IndexerConfig):
    # Salesforce record types to index. SalesforceIndexer.__post_init__ rejects
    # any value that is not listed in ACCEPTED_CATEGORIES.
    categories: list[str]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@dataclass
class SalesforceIndexer(Indexer):
    """Lists Salesforce records of the configured categories as FileData entries."""

    connection_config: SalesforceConnectionConfig
    index_config: SalesforceIndexerConfig

    def __post_init__(self):
        """Validate the configured categories.

        Raises:
            ValueError: if any category is not in ACCEPTED_CATEGORIES.
        """
        for record_type in self.index_config.categories:
            if record_type not in ACCEPTED_CATEGORIES:
                raise ValueError(f"{record_type} not currently an accepted Salesforce category")

    def precheck(self) -> None:
        """Verify that a Salesforce client can be constructed.

        Raises:
            SourceConnectionError: if client creation fails; the original
                exception is attached as the cause.
        """
        try:
            self.connection_config.get_client()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_file_extension(self, record_type) -> str:
        """Return the file extension used for a record type.

        Returns:
            ".eml" for EmailMessage, ".xml" for Account, Lead, Case and Campaign.

        Raises:
            MissingCategoryError: for any other record type.
        """
        if record_type == "EmailMessage":
            extension = ".eml"
        elif record_type in ["Account", "Lead", "Case", "Campaign"]:
            extension = ".xml"
        else:
            raise MissingCategoryError(
                f"There are no categories with the name: {record_type}",
            )
        return extension

    @requires_dependencies(["simple_salesforce"], extras="salesforce")
    def list_files(self) -> list[FileData]:
        """Get Salesforce Ids for the records.

        Send them to next phase where each doc gets downloaded into the
        appropriate format for partitioning.

        Raises:
            SalesforceMalformedRequest: re-raised unchanged (after logging) when
                Salesforce rejects the listing query.
        """
        from simple_salesforce.exceptions import SalesforceMalformedRequest

        client = self.connection_config.get_client()

        files_list = []
        for record_type in self.index_config.categories:
            try:
                # Get ids from Salesforce
                records = client.query_all_iter(
                    f"select Id, SystemModstamp, CreatedDate, LastModifiedDate from {record_type}",
                )
                for record in records:
                    record_id = record["Id"]
                    returned_type = record["attributes"]["type"]
                    record_with_extension = record_id + self.get_file_extension(returned_type)
                    files_list.append(
                        FileData(
                            connector_type=CONNECTOR_TYPE,
                            identifier=record_id,
                            source_identifiers=SourceIdentifiers(
                                filename=record_with_extension,
                                fullpath=f"{returned_type}/{record_with_extension}",
                            ),
                            metadata=FileDataSourceMetadata(
                                url=record["attributes"]["url"],
                                version=str(parser.parse(record["SystemModstamp"]).timestamp()),
                                date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
                                date_modified=str(
                                    parser.parse(record["LastModifiedDate"]).timestamp()
                                ),
                                record_locator={"id": record_id},
                            ),
                            additional_metadata={"record_type": returned_type},
                        )
                    )
            except SalesforceMalformedRequest as e:
                # Re-raise the original exception instead of constructing a new
                # SalesforceMalformedRequest from a single message string: that
                # constructor expects (url, status, resource_name, content), so
                # the previous code failed with TypeError and hid the real error.
                logger.error(f"Problem with Salesforce query: {e}")
                raise

        return files_list

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one FileData per listed Salesforce record."""
        yield from self.list_files()
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
class SalesforceDownloaderConfig(DownloaderConfig):
    # Adds no Salesforce-specific options on top of DownloaderConfig.
    pass
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
@dataclass
class SalesforceDownloader(Downloader):
    """Fetches a single Salesforce record and writes it to disk as .eml or .xml."""

    connection_config: SalesforceConnectionConfig
    download_config: SalesforceDownloaderConfig = field(
        default_factory=lambda: SalesforceDownloaderConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    def _xml_for_record(self, record: OrderedDict) -> str:
        """Creates partitionable xml file from a record

        Every non-nested field becomes an ``<item>`` whose text is
        ``"<dotted.key.path>: <value>"``; nested OrderedDict values are
        flattened into their parent with a dotted key prefix.
        """
        import xml.etree.ElementTree as ET

        def create_xml_doc(data, parent, prefix=""):
            for key, value in data.items():
                if isinstance(value, OrderedDict):
                    create_xml_doc(value, parent, prefix=f"{prefix}{key}.")
                else:
                    item = ET.Element("item")
                    item.text = f"{prefix}{key}: {value}"
                    parent.append(item)

        root = ET.Element("root")
        create_xml_doc(record, root)

        xml_string = ET.tostring(root, encoding="utf-8", xml_declaration=True).decode()
        return xml_string

    def _eml_for_record(self, email_json: dict[str, Any]) -> str:
        """Recreates standard expected .eml format using template."""
        eml = EMAIL_TEMPLATE.substitute(
            date=formatdate(parser.parse(email_json.get("MessageDate")).timestamp()),
            message_identifier=email_json.get("MessageIdentifier"),
            subject=email_json.get("Subject"),
            from_email=email_json.get("FromAddress"),
            to_email=email_json.get("ToAddress"),
            textbody=email_json.get("TextBody"),
            htmlbody=email_json.get("HtmlBody"),
        )
        return dedent(eml)

    @SourceConnectionNetworkError.wrap
    def _get_response(self, file_data: FileData) -> OrderedDict:
        """Query Salesforce for the standard fields of the record behind ``file_data``."""
        client = self.connection_config.get_client()
        return client.query(
            f"select FIELDS(STANDARD) from {file_data.additional_metadata['record_type']} where Id='{file_data.identifier}'",  # noqa: E501
        )

    def get_record(self, file_data: FileData) -> OrderedDict:
        """Return the first record of the query response for ``file_data``.

        Raises:
            ValueError: if the response contains no records.
        """
        # Get record from Salesforce based on id
        response = self._get_response(file_data)
        logger.debug(f"response was returned for salesforce record id: {file_data.identifier}")
        records = response["records"]
        if not records:
            raise ValueError(
                f"No record found with record id {file_data.identifier}: {json.dumps(response)}"
            )
        record_json = records[0]
        return record_json

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download one record and write it to its download path.

        EmailMessage records are rendered as .eml, all other record types as
        XML.

        Raises:
            SourceConnectionNetworkError: if rendering or writing the document
                fails; the original exception is attached as the cause.
        """
        record = self.get_record(file_data)

        try:
            if file_data.additional_metadata["record_type"] == "EmailMessage":
                document = self._eml_for_record(record)
            else:
                document = self._xml_for_record(record)
            download_path = self.get_download_path(file_data=file_data)
            download_path.parent.mkdir(parents=True, exist_ok=True)

            # Write as UTF-8 explicitly: the XML document declares utf-8 and the
            # email template declares charset="UTF-8", so relying on the
            # platform default encoding could produce a mismatched file.
            with open(download_path, "w", encoding="utf-8") as page_file:
                page_file.write(document)

        except Exception as e:
            logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
            raise SourceConnectionNetworkError(
                f"failed to download file {file_data.identifier}"
            ) from e

        return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# Registry entry bundling the Salesforce connection config, indexer, and
# downloader classes (with their configs) defined in this module.
salesforce_source_entry = SourceRegistryEntry(
    connection_config=SalesforceConnectionConfig,
    indexer_config=SalesforceIndexerConfig,
    indexer=SalesforceIndexer,
    downloader_config=SalesforceDownloaderConfig,
    downloader=SalesforceDownloader,
)
|