unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from time import time
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
7
|
+
from urllib.parse import quote
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field, Secret, SecretStr
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
12
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
+
from unstructured_ingest.v2.interfaces import (
|
|
14
|
+
AccessConfig,
|
|
15
|
+
ConnectionConfig,
|
|
16
|
+
Downloader,
|
|
17
|
+
DownloaderConfig,
|
|
18
|
+
DownloadResponse,
|
|
19
|
+
FileData,
|
|
20
|
+
FileDataSourceMetadata,
|
|
21
|
+
Indexer,
|
|
22
|
+
IndexerConfig,
|
|
23
|
+
SourceIdentifiers,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.v2.logger import logger
|
|
26
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
27
|
+
SourceRegistryEntry,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
from .utils import parse_datetime
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from office365.graph_client import GraphClient
|
|
34
|
+
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
35
|
+
from office365.onedrive.drives.drive import Drive
|
|
36
|
+
from office365.onedrive.permissions.permission import Permission
|
|
37
|
+
from office365.onedrive.sites.site import Site
|
|
38
|
+
from office365.sharepoint.client_context import ClientContext
|
|
39
|
+
from office365.sharepoint.files.file import File
|
|
40
|
+
from office365.sharepoint.folders.folder import Folder
|
|
41
|
+
from office365.sharepoint.publishing.pages.page import SitePage
|
|
42
|
+
|
|
43
|
+
# Connector name stamped on every FileData this module produces.
CONNECTOR_TYPE = "sharepoint"

# NOTE(review): the name says "MB" but the value is 512,000,000 — presumably a
# byte count; nothing in the visible code reads it. Confirm before relying on it.
MAX_MB_SIZE = 512 * 1_000_000
|
|
46
|
+
|
|
47
|
+
# TODO handle other data types possible from Sharepoint
|
|
48
|
+
# examples: https://github.com/vgrem/Office365-REST-Python-Client/tree/master/examples/sharepoint
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class SharepointContentType(Enum):
    """Kinds of Sharepoint content.

    The ``value`` of a member is what gets stored under the
    ``"sharepoint_content_type"`` key of ``FileData.additional_metadata``.
    """

    # Regular file from a document library.
    DOCUMENT = "document"
    # Site page, written out as HTML by the downloader.
    SITEPAGE = "site_page"
    # Declared, but not produced anywhere in the visible code.
    LIST = "list"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class SharepointAccessConfig(AccessConfig):
    # Secret half of the Sharepoint app registration; it is combined with the
    # connection config's ``client_id`` when the client credentials are built.
    client_cred: str = Field(description="Sharepoint app secret")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class SharepointPermissionsConfig(BaseModel):
    # Optional settings used to obtain a Microsoft Graph token for reading
    # permissions. Every field has a default, so the model can be built empty.

    # Application (client) id passed to msal as ``client_id``.
    permissions_application_id: Optional[str] = Field(
        description="Microsoft Graph API application id",
        default=None,
    )
    # Appended to ``authority_url`` to form the msal authority.
    permissions_tenant: Optional[str] = Field(
        description="url to get permissions data within tenant.",
        examples=["https://contoso.onmicrosoft.com"],
        default=None,
    )
    # Client credential passed to msal; kept as a SecretStr.
    permissions_client_cred: Optional[SecretStr] = Field(
        description="Microsoft Graph API application credentials",
        default=None,
    )
    # Base of the msal authority URL; excluded from the model repr.
    authority_url: Optional[SecretStr] = Field(
        description="Permissions authority url",
        examples=["https://login.microsoftonline.com"],
        default_factory=lambda: SecretStr(secret_value="https://login.microsoftonline.com"),
        repr=False,
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class SharepointConnectionConfig(ConnectionConfig):
    # Connection settings for a Sharepoint site: the app client id, the site
    # url, the secret access config and optional Graph permissions settings.

    client_id: str = Field(description="Sharepoint app client ID")
    # The description is assembled with implicit string concatenation rather
    # than backslash continuations inside one literal, so the user-facing text
    # has single spaces between sentences and no source-layout whitespace.
    site: str = Field(
        description=(
            "Sharepoint site url. Process either base url e.g "
            "https://[tenant].sharepoint.com or relative sites "
            "https://[tenant].sharepoint.com/sites/<site_name>. "
            "To process all sites within the tenant pass a site url as "
            "https://[tenant]-admin.sharepoint.com. "
            "This requires the app to be registered at a tenant level"
        )
    )
    access_config: Secret[SharepointAccessConfig]
    # When left as None no permissions client is created.
    permissions_config: Optional[SharepointPermissionsConfig] = None
|
|
93
|
+
|
|
94
|
+
@requires_dependencies(["office365"], extras="sharepoint")
def get_client(self) -> "ClientContext":
    """Build a Sharepoint ``ClientContext`` for ``self.site``.

    The context is authenticated with a ``ClientCredential`` made from
    ``self.client_id`` and the secret ``client_cred``. Any exception raised
    while building it is logged and then re-raised unchanged.
    """
    from office365.runtime.auth.client_credential import ClientCredential
    from office365.sharepoint.client_context import ClientContext

    try:
        app_secret = self.access_config.get_secret_value().client_cred
        return ClientContext(self.site).with_credentials(
            ClientCredential(self.client_id, app_secret)
        )
    except Exception as e:
        logger.error(f"Couldn't set Sharepoint client: {e}")
        raise e
|
|
108
|
+
|
|
109
|
+
@requires_dependencies(["msal"], extras="sharepoint")
def get_permissions_token(self):
    """Acquire a Microsoft Graph token via msal's client-credential flow.

    Returns:
        The token mapping returned by ``acquire_token_for_client``.

    Raises:
        ValueError: re-raised (after logging) when msal rejects the inputs.
        SourceConnectionNetworkError: when the response contains ``"error"``.
    """
    from msal import ConfidentialClientApplication

    try:
        permissions = self.permissions_config
        secret = permissions.permissions_client_cred.get_secret_value()
        # Authority is "<authority_url>/<permissions_tenant>".
        authority = (
            f"{permissions.authority_url.get_secret_value()}/"
            f"{permissions.permissions_tenant}"
        )
        app = ConfidentialClientApplication(
            authority=authority,
            client_id=permissions.permissions_application_id,
            client_credential=secret,
        )
        token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
    except ValueError as exc:
        logger.error("Couldn't set up credentials for Sharepoint")
        raise exc

    if "error" not in token:
        return token
    raise SourceConnectionNetworkError(
        f"failed to fetch token, {token['error']}: {token['error_description']}"
    )
|
|
130
|
+
|
|
131
|
+
@requires_dependencies(["office365"], extras="sharepoint")
def get_permissions_client(self) -> Optional["GraphClient"]:
    """Return a ``GraphClient`` for permission lookups, or None if unconfigured.

    The client receives ``self.get_permissions_token`` as a callable; the
    token is not fetched here.
    """
    from office365.graph_client import GraphClient

    if self.permissions_config is not None:
        return GraphClient(self.get_permissions_token)
    return None
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class SharepointIndexerConfig(IndexerConfig):
    # Options controlling which Sharepoint content the indexer walks.

    # When unset, the indexer fills this in with the name of the default
    # document library's root folder (see SharepointIndexer.get_root).
    # The description uses implicit string concatenation (like ``recursive``
    # below) so no source-layout whitespace leaks into the user-facing text.
    path: Optional[str] = Field(
        default=None,
        description=(
            "Path from which to start parsing files. If the connector is to "
            "process all sites within the tenant this filter will be applied to "
            "all sites document libraries."
        ),
    )
    recursive: bool = Field(
        default=False,
        description="Recursively download files in their respective folders "
        "otherwise stop at the files in provided folder level.",
    )
    omit_files: bool = Field(default=False, description="Don't process files.")
    omit_pages: bool = Field(default=False, description="Don't process site pages.")
    omit_lists: bool = Field(default=False, description="Don't process lists.")
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@dataclass
class SharepointIndexer(Indexer):
    """Indexer that turns Sharepoint files and site pages into ``FileData``."""

    connection_config: SharepointConnectionConfig
    # A fresh default config is created per instance.
    index_config: SharepointIndexerConfig = field(default_factory=SharepointIndexerConfig)
|
|
163
|
+
|
|
164
|
+
def precheck(self) -> None:
    """Validate the connection by querying the site's pages.

    Raises:
        SourceConnectionError: on any failure; the underlying exception is
            attached as ``__cause__``.
    """
    try:
        site_client = self.connection_config.get_client()
        site_client.site_pages.pages.get().execute_query()
    except Exception as e:
        logger.error(f"failed to validate connection: {e}", exc_info=True)
        # Chain explicitly so the original failure stays visible as the cause.
        raise SourceConnectionError(f"failed to validate connection: {e}") from e
|
|
171
|
+
|
|
172
|
+
def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
    """List the files in ``folder``, optionally descending into sub-folders.

    Args:
        folder: Folder to list; it is expanded and queried here.
        recursive: When true, files of nested folders are included as well.

    Returns:
        A plain list of files in both modes, matching the annotation.
    """
    expansions = ["Files", "Folders"] if recursive else ["Files"]
    folder.expand(expansions).get().execute_query()
    files: list["File"] = list(folder.files)
    if not recursive:
        return files

    sub_folders: list["Folder"] = list(folder.folders)
    for sub_folder in sub_folders:
        # Any folder whose server-relative url contains "/Forms" is skipped.
        if "/Forms" in sub_folder.serverRelativeUrl:
            continue
        files.extend(self.list_files(sub_folder, recursive))
    return files
|
|
185
|
+
|
|
186
|
+
def get_properties(self, raw_properties: dict) -> dict:
    """Return the truthy, JSON-serializable entries of ``raw_properties``.

    Falsy values are dropped first; of the remainder, any value for which
    ``json.dumps`` raises ``TypeError`` is dropped too. Key order is preserved.
    """

    def _serializable(value) -> bool:
        try:
            json.dumps(value)
        except TypeError:
            return False
        return True

    return {
        key: value
        for key, value in raw_properties.items()
        if value and _serializable(value)
    }
|
|
196
|
+
|
|
197
|
+
def list_pages(self, client: "ClientContext") -> list["SitePage"]:
    """Query and return the site pages available through ``client``."""
    return client.site_pages.pages.get().execute_query()
|
|
200
|
+
|
|
201
|
+
def page_to_file_data(self, site_page: "SitePage") -> FileData:
    """Convert a Sharepoint site page into ``FileData``.

    The page is expanded with all of its property keys first, so the
    serializable ones can be carried along as ``additional_metadata``; the
    downloader later reads the page content from there.
    """
    site_page.expand(site_page.properties.keys()).get().execute_query()
    properties = site_page.properties

    modified_date = properties.get("Modified", None)
    date_modified_dt = parse_datetime(modified_date) if modified_date else None

    first_published = site_page.first_published
    # "0001-01-01T08:00:00Z" is treated as "no publication date", not parsed.
    if first_published and first_published != "0001-01-01T08:00:00Z":
        date_created_at = parse_datetime(first_published)
    else:
        date_created_at = None

    # "Url" defaults to "" and may be empty, so never index into it directly:
    # ``file_path[0]`` raised IndexError for such pages.
    file_path = site_page.get_property("Url", "") or ""
    # Drop a single leading slash, if there is one.
    server_path = file_path[1:] if file_path.startswith("/") else file_path

    additional_metadata = self.get_properties(raw_properties=properties)
    additional_metadata["sharepoint_content_type"] = SharepointContentType.SITEPAGE.value

    return FileData(
        identifier=properties.get("UniqueId", None),
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(
            filename=site_page.file_name,
            fullpath=file_path,
            # ``or ""`` keeps str.replace from failing if no path is configured.
            rel_path=file_path.replace(self.index_config.path or "", ""),
        ),
        metadata=FileDataSourceMetadata(
            url=properties.get("AbsoluteUrl", None),
            version=properties.get("Version", None),
            date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
            date_created=str(date_created_at.timestamp()) if date_created_at else None,
            date_processed=str(time()),
            record_locator={
                "server_path": server_path,
            },
        ),
        additional_metadata=additional_metadata,
    )
|
|
237
|
+
|
|
238
|
+
def file_to_file_data(self, client: "ClientContext", file: "File") -> FileData:
    """Convert a Sharepoint document into ``FileData``.

    The file is expanded with all of its property keys first, so the
    serializable ones can be carried along as ``additional_metadata``.
    """
    file.expand(file.properties.keys()).get().execute_query()
    absolute_url = f"{client.base_url}{quote(file.serverRelativeUrl)}"

    date_modified_dt = (
        parse_datetime(file.time_last_modified) if file.time_last_modified else None
    )
    date_created_at = parse_datetime(file.time_created) if file.time_created else None

    additional_metadata = self.get_properties(raw_properties=file.properties)
    additional_metadata["sharepoint_content_type"] = SharepointContentType.DOCUMENT.value

    fullpath = str(file.serverRelativeUrl)
    # lstrip removes every leading slash and is safe on an empty string; the
    # previous ``while rel_path[0] == "/"`` loop raised IndexError once the
    # path had been stripped down to "".
    rel_path = fullpath.replace(self.index_config.path or "", "").lstrip("/")

    return FileData(
        identifier=file.unique_id,
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(
            filename=file.name,
            fullpath=fullpath,
            rel_path=rel_path,
        ),
        metadata=FileDataSourceMetadata(
            url=absolute_url,
            version=f"{file.major_version}.{file.minor_version}",
            date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
            date_created=str(date_created_at.timestamp()) if date_created_at else None,
            date_processed=str(time()),
            record_locator={"server_path": file.serverRelativeUrl, "site_url": client.base_url},
        ),
        additional_metadata=additional_metadata,
    )
|
|
270
|
+
|
|
271
|
+
def get_root(self, client: "ClientContext") -> "Folder":
    """Return the folder indexing starts from.

    A configured ``index_config.path`` is looked up by server-relative path.
    Otherwise the default document library's root folder is fetched and its
    name is written back to ``index_config.path`` (a side effect later
    ``rel_path`` computations depend on).
    """
    configured_path = self.index_config.path
    if configured_path:
        return client.web.get_folder_by_server_relative_path(configured_path)

    library = client.web.default_document_library()
    root_folder = library.root_folder.get().execute_query()
    self.index_config.path = root_folder.name
    return root_folder
|
|
279
|
+
|
|
280
|
+
def get_site_url(self, client: "ClientContext") -> str:
    """Query the web object behind ``client`` and return its ``url``."""
    web = client.web.get().execute_query()
    return web.url
|
|
283
|
+
|
|
284
|
+
def get_site(self, permissions_client: "GraphClient", site_url) -> "Site":
    """Look up the Graph ``Site`` for ``site_url`` and execute the query."""
    site_query = permissions_client.sites.get_by_url(url=site_url)
    return site_query.execute_query()
|
|
286
|
+
|
|
287
|
+
def get_permissions_items(self, site: "Site") -> list["DriveItem"]:
    """Collect the root-level children of every drive of ``site``."""
    # TODO find a way to narrow this search down by name of drive
    drives: list["Drive"] = site.drives.get_all().execute_query()
    return [
        item
        for drive in drives
        for item in drive.root.children.get_all().execute_query()
    ]
|
|
294
|
+
|
|
295
|
+
def map_permission(self, permission: "Permission") -> dict:
    """Flatten a Graph ``Permission`` into a plain dict.

    Scalar attributes are copied as-is (``roles`` as a list); the nested
    attributes are converted through their own ``to_json()``.
    """
    mapped = {
        "id": permission.id,
        "roles": list(permission.roles),
        "share_id": permission.share_id,
        "has_password": permission.has_password,
    }
    # Each of these is an object exposing ``to_json()``; order fixes key order.
    nested_attributes = (
        "link",
        "granted_to_identities",
        "granted_to",
        "granted_to_v2",
        "granted_to_identities_v2",
        "invitation",
    )
    for attribute in nested_attributes:
        mapped[attribute] = getattr(permission, attribute).to_json()
    return mapped
|
|
308
|
+
|
|
309
|
+
def enrich_permissions_on_files(self, all_file_data: list[FileData], site_url: str) -> None:
    """Attach Graph permissions to each file whose ETag matches one drive item.

    Files without an ``"ETag"`` in ``additional_metadata``, without a matching
    drive item, or with more than one matching drive item are left untouched.
    Does nothing when no permissions client is configured.

    Args:
        all_file_data: Records to enrich in place
            (``metadata.permissions_data`` is set).
        site_url: Url used to resolve the Graph site.
    """
    logger.debug("Enriching permissions on files")
    permission_client = self.connection_config.get_permissions_client()
    if permission_client is None:
        return
    site = self.get_site(permissions_client=permission_client, site_url=site_url)

    # Group the drive items by etag once instead of rescanning the whole item
    # list for every file.
    items_by_etag: dict = {}
    for item in self.get_permissions_items(site=site):
        items_by_etag.setdefault(item.etag, []).append(item)

    for file_data in all_file_data:
        etag = file_data.additional_metadata.get("ETag")
        if not etag:
            continue
        matching_items = items_by_etag.get(etag, [])
        if not matching_items:
            continue
        if len(matching_items) > 1:
            # Ambiguous match: skip rather than pick one.
            logger.warning(
                "Found multiple drive items with etag matching {}, skipping: {}".format(
                    etag, ", ".join([i.name for i in matching_items])
                )
            )
            continue
        matching_item = matching_items[0]
        permissions: list["Permission"] = matching_item.permissions.get_all().execute_query()
        file_data.metadata.permissions_data = [
            self.map_permission(permission=permission) for permission in permissions
        ]
|
|
336
|
+
|
|
337
|
+
@property
def process_permissions(self) -> bool:
    """Whether tenant, client credential and application id are all set.

    Always returns a real ``bool``. A permissions config whose
    ``permissions_client_cred`` is None (its default) yields False instead of
    raising ``AttributeError``.
    """
    config = self.connection_config.permissions_config
    if config is None:
        return False
    client_cred = config.permissions_client_cred
    return bool(
        config.permissions_tenant
        and client_cred is not None
        and client_cred.get_secret_value()
        and config.permissions_application_id
    )
|
|
345
|
+
|
|
346
|
+
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
    """Yield ``FileData`` for documents first, then for site pages.

    Documents are skipped when ``omit_files`` is set and pages when
    ``omit_pages`` is set; ``omit_lists`` is not consulted here. All documents
    are converted (and, if enabled, enriched with permissions) before the
    first one is yielded.
    """
    client = self.connection_config.get_client()
    # get_root may also fill in index_config.path, which the log line reads.
    root_folder = self.get_root(client=client)
    logger.debug(f"processing content from path: {self.index_config.path}")

    if not self.index_config.omit_files:
        files = self.list_files(root_folder, recursive=self.index_config.recursive)
        documents = [self.file_to_file_data(file=file, client=client) for file in files]
        if self.process_permissions:
            self.enrich_permissions_on_files(
                all_file_data=documents, site_url=self.get_site_url(client=client)
            )
        yield from documents

    if not self.index_config.omit_pages:
        for page in self.list_pages(client=client):
            page_data = self.page_to_file_data(site_page=page)
            # page_to_file_data only records server_path; add the site url.
            page_data.metadata.record_locator["site_url"] = client.base_url
            yield page_data
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
class SharepointDownloaderConfig(DownloaderConfig):
    # Adds no fields of its own; exists so the connector has a dedicated type.
    pass
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
@dataclass
class SharepointDownloader(Downloader):
    """Downloader writing Sharepoint documents and site pages to local files."""

    connection_config: SharepointConnectionConfig
    download_config: SharepointDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
|
|
376
|
+
|
|
377
|
+
def get_download_path(self, file_data: FileData) -> Path:
    """Return the download path, with an ``.html`` suffix for site pages."""
    download_path = super().get_download_path(file_data=file_data)
    content_type = file_data.additional_metadata.get("sharepoint_content_type")
    is_site_page = content_type == SharepointContentType.SITEPAGE.value
    # Update output extension to html if site page
    return download_path.with_suffix(".html") if is_site_page else download_path
|
|
385
|
+
|
|
386
|
+
def get_document(self, file_data: FileData) -> DownloadResponse:
    """Download the document identified by ``file_data.identifier`` to disk."""
    client: "ClientContext" = self.connection_config.get_client()
    remote_file: "File" = client.web.get_file_by_id(unique_id=file_data.identifier)

    target = self.get_download_path(file_data=file_data)
    target.parent.mkdir(parents=True, exist_ok=True)
    logger.debug(
        f"writing document content {file_data.source_identifiers.fullpath} to {target}"
    )
    with target.open("wb") as handle:
        remote_file.download(handle).execute_query()
    return self.generate_download_response(file_data=file_data, download_path=target)
|
|
397
|
+
|
|
398
|
+
def get_site_page(self, file_data: FileData) -> DownloadResponse:
    """Render a site page to an HTML file from its indexed metadata.

    Web part titles from ``LayoutWebpartsContent`` come first, followed by the
    ``innerHTML`` of each ``CanvasContent1`` entry, all wrapped in one
    ``<div>``. No request is made here; everything is read from
    ``file_data.additional_metadata``.
    """
    # TODO fetch comments for site page as well
    from html import escape

    from lxml import etree, html

    canvas_content_raw = file_data.additional_metadata.get("CanvasContent1")
    layout_web_parts_content_raw = file_data.additional_metadata.get("LayoutWebpartsContent")
    html_content = []
    if layout_web_parts_content_raw:
        for web_part in json.loads(layout_web_parts_content_raw):
            properties = web_part.get("properties", {})
            if title := properties.get("title"):
                # Titles are plain text taken from the page payload: escape
                # them so "<" or "&" cannot alter the generated markup.
                html_content.append(f"<title>{escape(str(title), quote=False)}</title>")
    if canvas_content_raw:
        for canvas_part in json.loads(canvas_content_raw):
            # innerHTML is already markup and is inserted verbatim.
            if inner_html := canvas_part.get("innerHTML"):
                html_content.append(inner_html)

    document = html.fromstring(f"<div>{''.join(html_content)}</div>")
    download_path = self.get_download_path(file_data=file_data)
    download_path.parent.mkdir(parents=True, exist_ok=True)
    logger.debug(
        f"writing site page content {file_data.source_identifiers.filename} to {download_path}"
    )
    # Pin the encoding: the platform default may be unable to encode the page.
    with download_path.open("w", encoding="utf-8") as f:
        f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
    return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
427
|
+
|
|
428
|
+
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
    """Dispatch on the ``sharepoint_content_type`` recorded by the indexer.

    Raises:
        ValueError: if the content type is missing or is neither a document
            nor a site page.
    """
    content_type = file_data.additional_metadata.get("sharepoint_content_type")
    if not content_type:
        raise ValueError(
            f"Missing sharepoint_content_type metadata: {file_data.additional_metadata}"
        )
    if content_type == SharepointContentType.SITEPAGE.value:
        return self.get_site_page(file_data=file_data)
    if content_type == SharepointContentType.DOCUMENT.value:
        return self.get_document(file_data=file_data)
    raise ValueError(f"content type not recognized: {content_type}")
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
# Source registry entry tying together the Sharepoint connector's classes.
sharepoint_source_entry = SourceRegistryEntry(
    indexer=SharepointIndexer,
    indexer_config=SharepointIndexerConfig,
    downloader=SharepointDownloader,
    downloader_config=SharepointDownloaderConfig,
    connection_config=SharepointConnectionConfig,
)
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import time
|
|
3
|
+
import xml.etree.ElementTree as ET
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import Field, Secret
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.error import SourceConnectionError
|
|
12
|
+
from unstructured_ingest.logger import logger
|
|
13
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
+
from unstructured_ingest.v2.interfaces import (
|
|
15
|
+
AccessConfig,
|
|
16
|
+
ConnectionConfig,
|
|
17
|
+
Downloader,
|
|
18
|
+
DownloaderConfig,
|
|
19
|
+
DownloadResponse,
|
|
20
|
+
Indexer,
|
|
21
|
+
IndexerConfig,
|
|
22
|
+
)
|
|
23
|
+
from unstructured_ingest.v2.interfaces.file_data import (
|
|
24
|
+
FileData,
|
|
25
|
+
FileDataSourceMetadata,
|
|
26
|
+
SourceIdentifiers,
|
|
27
|
+
)
|
|
28
|
+
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from slack_sdk import WebClient
|
|
32
|
+
from slack_sdk.web.async_client import AsyncWebClient
|
|
33
|
+
|
|
34
|
+
# NOTE: Page size for Slack API calls, set to the upper end of the
# recommended range: https://api.slack.com/apis/pagination#facts
PAGINATION_LIMIT = 200

# Connector name stamped on every FileData this module produces.
CONNECTOR_TYPE = "slack"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class SlackAccessConfig(AccessConfig):
    # Token handed to both the sync and the async Slack client.
    token: str = Field(
        description=(
            "Bot token used to access Slack API, must have channels:history scope "
            "for the bot user."
        )
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class SlackConnectionConfig(ConnectionConfig):
    # Holds the secret access config from which the Slack clients are built.
    access_config: Secret[SlackAccessConfig]
|
|
50
|
+
|
|
51
|
+
@requires_dependencies(["slack_sdk"], extras="slack")
@SourceConnectionError.wrap
def get_client(self) -> "WebClient":
    """Return a synchronous Slack ``WebClient`` using the configured token."""
    from slack_sdk import WebClient

    bot_token = self.access_config.get_secret_value().token
    return WebClient(token=bot_token)
|
|
57
|
+
|
|
58
|
+
@requires_dependencies(["slack_sdk"], extras="slack")
@SourceConnectionError.wrap
def get_async_client(self) -> "AsyncWebClient":
    """Return a Slack ``AsyncWebClient`` using the configured token."""
    from slack_sdk.web.async_client import AsyncWebClient

    bot_token = self.access_config.get_secret_value().token
    return AsyncWebClient(token=bot_token)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class SlackIndexerConfig(IndexerConfig):
    # Channels to index plus an optional time window.

    channels: list[str] = Field(
        description=(
            "Comma-delimited list of Slack channel IDs to pull messages from, "
            "can be both public or private channels."
        )
    )
    # Sent to Slack as the ``oldest`` bound (as a timestamp string) when set.
    start_date: Optional[datetime] = Field(
        description=(
            "Start date/time in formats YYYY-MM-DD[T]HH:MM[:SS[.ffffff]][Z or [±]HH[:]MM] "
            "or YYYY-MM-DD"
        ),
        default=None,
    )
    # Sent to Slack as the ``latest`` bound (as a timestamp string) when set.
    end_date: Optional[datetime] = Field(
        description=(
            "End date/time in formats YYYY-MM-DD[T]HH:MM[:SS[.ffffff]][Z or [±]HH[:]MM] "
            "or YYYY-MM-DD"
        ),
        default=None,
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class SlackIndexer(Indexer):
    """Indexer emitting one ``FileData`` per page of a channel's history."""

    index_config: SlackIndexerConfig
    connection_config: SlackConnectionConfig
    connector_type: str = CONNECTOR_TYPE
|
|
88
|
+
|
|
89
|
+
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
    """Yield one ``FileData`` per non-empty page of each channel's history.

    The optional ``start_date`` / ``end_date`` bounds are sent to Slack as
    ``oldest`` / ``latest`` timestamp strings.
    """
    client = self.connection_config.get_client()

    # The window is the same for every channel, so compute it once up front.
    start_date = self.index_config.start_date
    end_date = self.index_config.end_date
    oldest = str(start_date.timestamp()) if start_date is not None else None
    latest = str(end_date.timestamp()) if end_date is not None else None

    for channel in self.index_config.channels:
        # Iterating the response walks its pages.
        for conversation_history in client.conversations_history(
            channel=channel,
            oldest=oldest,
            latest=latest,
            limit=PAGINATION_LIMIT,
        ):
            messages = conversation_history.get("messages", [])
            if messages:
                yield self._messages_to_file_data(messages, channel)
|
|
112
|
+
|
|
113
|
+
def _messages_to_file_data(
    self,
    messages: list[dict],
    channel: str,
) -> FileData:
    """Describe a batch of messages as a single ``FileData``.

    The identifier is the SHA-256 of ``"<channel>-<oldest ts>-<newest ts>"``
    and its first 16 hex characters form the ``.xml`` file name. The record
    locator carries the channel and both timestamps for the downloader.
    """
    timestamps = [message["ts"] for message in messages]
    # Order numerically, but keep the original strings.
    ts_oldest = min(timestamps, key=float)
    ts_newest = max(timestamps, key=float)

    digest_source = f"{channel}-{ts_oldest}-{ts_newest}"
    identifier = hashlib.sha256(digest_source.encode("utf-8")).hexdigest()
    xml_name = f"{identifier[:16]}.xml"

    locator = {
        "channel": channel,
        "oldest": ts_oldest,
        "latest": ts_newest,
    }
    return FileData(
        identifier=identifier,
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(filename=xml_name, fullpath=xml_name),
        metadata=FileDataSourceMetadata(
            date_created=ts_oldest,
            date_modified=ts_newest,
            date_processed=str(time.time()),
            record_locator=locator,
        ),
    )
|
|
142
|
+
|
|
143
|
+
@SourceConnectionError.wrap
def precheck(self) -> None:
    """Request one history entry from every configured channel."""
    slack = self.connection_config.get_client()
    for channel_id in self.index_config.channels:
        # NOTE: Querying conversations history guarantees that the bot is in the channel
        slack.conversations_history(channel=channel_id, limit=1)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class SlackDownloaderConfig(DownloaderConfig):
    # Adds no fields of its own; exists so the connector has a dedicated type.
    pass
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
@dataclass
class SlackDownloader(Downloader):
    """Async downloader writing a Slack conversation range to an XML file."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: SlackConnectionConfig
    download_config: SlackDownloaderConfig = field(default_factory=SlackDownloaderConfig)
|
|
160
|
+
|
|
161
|
+
def run(self, file_data, **kwargs):
    """Synchronous download is unsupported; ``run_async`` does the work."""
    raise NotImplementedError()
|
|
163
|
+
|
|
164
|
+
async def run_async(self, file_data: FileData, **kwargs) -> DownloadResponse:
    """Download the conversation described by ``file_data``.

    Raises:
        ValueError: if no download path can be derived from ``file_data``.
    """
    # NOTE: Indexer should provide source identifiers required to generate the download path
    download_path = self.get_download_path(file_data)
    if download_path is None:
        # The two literals used to be joined without a space
        # ("...missingfrom FileData.").
        logger.error(
            "Generated download path is None, source_identifiers might be missing "
            "from FileData."
        )
        raise ValueError("Generated invalid download path.")

    await self._download_conversation(file_data, download_path)
    return self.generate_download_response(file_data, download_path)
|
|
176
|
+
|
|
177
|
+
def is_async(self):
    """Report that this downloader runs through ``run_async``."""
    supports_async = True
    return supports_async
|
|
179
|
+
|
|
180
|
+
async def _download_conversation(self, file_data: FileData, download_path: Path) -> None:
    """Fetch a message range with its threads and write it as XML.

    The range comes from the record locator (``channel``, ``oldest``,
    ``latest``). For every message in range its thread is fetched, and the
    list of threads is serialized to ``download_path``.

    Raises:
        ValueError: if the record locator is missing or lacks a required key.
    """
    # NOTE: Indexer should supply the record locator in metadata
    locator = file_data.metadata.record_locator
    if locator is None or any(
        key not in locator for key in ("channel", "oldest", "latest")
    ):
        # The two literals used to be joined without a space ("...}.Keys").
        logger.error(
            f"Invalid record locator in metadata: {locator}. "
            "Keys 'channel', 'oldest' and 'latest' must be present."
        )
        raise ValueError("Invalid record locator.")

    client = self.connection_config.get_async_client()
    messages = []
    async for conversation_history in await client.conversations_history(
        channel=locator["channel"],
        oldest=locator["oldest"],
        latest=locator["latest"],
        limit=PAGINATION_LIMIT,
        # NOTE: In order to get the exact same range of messages as indexer, it provides
        # timestamps of oldest and newest messages, inclusive=True is necessary to include them
        inclusive=True,
    ):
        messages += conversation_history.get("messages", [])

    conversation = []
    for message in messages:
        thread_messages = []
        async for conversations_replies in await client.conversations_replies(
            channel=locator["channel"],
            ts=message["ts"],
            limit=PAGINATION_LIMIT,
        ):
            thread_messages += conversations_replies.get("messages", [])

        # NOTE: Replies contains the whole thread, including the message references by the `ts`
        # parameter even if it's the only message (there were no replies).
        # Reference: https://api.slack.com/methods/conversations.replies#markdown
        conversation.append(thread_messages)

    conversation_xml = self._conversation_to_xml(conversation)
    download_path.parent.mkdir(exist_ok=True, parents=True)
    conversation_xml.write(download_path, encoding="utf-8", xml_declaration=True)
|
|
225
|
+
|
|
226
|
+
def _conversation_to_xml(self, conversation: list[list[dict]]) -> ET.ElementTree:
|
|
227
|
+
root = ET.Element("messages")
|
|
228
|
+
|
|
229
|
+
for thread in conversation:
|
|
230
|
+
message, *replies = thread
|
|
231
|
+
message_elem = ET.SubElement(root, "message")
|
|
232
|
+
text_elem = ET.SubElement(message_elem, "text")
|
|
233
|
+
text_elem.text = message.get("text")
|
|
234
|
+
|
|
235
|
+
for reply in replies:
|
|
236
|
+
reply_msg = reply.get("text", "")
|
|
237
|
+
text_elem.text = "".join([str(text_elem.text), " <reply> ", reply_msg])
|
|
238
|
+
|
|
239
|
+
return ET.ElementTree(root)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
# Source registry entry tying together the Slack connector's classes.
# The downloader config is the connector-specific SlackDownloaderConfig (the
# type SlackDownloader.download_config declares), not the generic base class.
slack_source_entry = SourceRegistryEntry(
    indexer=SlackIndexer,
    indexer_config=SlackIndexerConfig,
    downloader=SlackDownloader,
    downloader_config=SlackDownloaderConfig,
    connection_config=SlackConnectionConfig,
)
|