unstructured_ingest-0.3.13-py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from pydantic import Field, Secret
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
7
|
+
DestinationRegistryEntry,
|
|
8
|
+
SourceRegistryEntry,
|
|
9
|
+
)
|
|
10
|
+
from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
|
|
11
|
+
DatabricksVolumesAccessConfig,
|
|
12
|
+
DatabricksVolumesConnectionConfig,
|
|
13
|
+
DatabricksVolumesDownloader,
|
|
14
|
+
DatabricksVolumesDownloaderConfig,
|
|
15
|
+
DatabricksVolumesIndexer,
|
|
16
|
+
DatabricksVolumesIndexerConfig,
|
|
17
|
+
DatabricksVolumesUploader,
|
|
18
|
+
DatabricksVolumesUploaderConfig,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
CONNECTOR_TYPE = "databricks_volumes_azure"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Azure-flavoured credentials layered on the shared Databricks Volumes access
# config. Every field declared here is optional and defaults to None.
class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        description="The Databricks account ID for the Databricks accounts endpoint.",
        default=None,
    )
    profile: Optional[str] = None
    azure_workspace_resource_id: Optional[str] = Field(
        description=(
            "The Azure Resource Manager ID for the Azure Databricks workspace, "
            "which is exchanged for a Databricks host URL."
        ),
        default=None,
    )
    azure_client_secret: Optional[str] = Field(
        description="The Azure AD service principal’s client secret.",
        default=None,
    )
    azure_client_id: Optional[str] = Field(
        description="The Azure AD service principal’s application ID.",
        default=None,
    )
    azure_tenant_id: Optional[str] = Field(
        description="The Azure AD service principal’s tenant ID.",
        default=None,
    )
    azure_environment: Optional[str] = Field(
        description="The Azure environment type for a specific set of API endpoints",
        examples=["Public", "UsGov", "China", "Germany"],
        default=None,
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Connection settings for the Azure variant: narrows ``access_config`` to the
# Azure access config, wrapped in pydantic's ``Secret``.
class DatabricksAzureVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    access_config: Secret[DatabricksAzureVolumesAccessConfig]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Azure indexer settings; declares nothing beyond the shared Databricks
# Volumes indexer config it inherits from.
class DatabricksAzureVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    pass
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class DatabricksAzureVolumesIndexer(DatabricksVolumesIndexer):
    """Databricks Volumes indexer typed against the Azure connection and indexer configs."""

    connection_config: DatabricksAzureVolumesConnectionConfig
    index_config: DatabricksAzureVolumesIndexerConfig
    # Tags this indexer with the module-level connector identifier.
    connector_type: str = CONNECTOR_TYPE
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Azure downloader settings; declares nothing beyond the shared Databricks
# Volumes downloader config it inherits from.
class DatabricksAzureVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    pass
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass
class DatabricksAzureVolumesDownloader(DatabricksVolumesDownloader):
    """Databricks Volumes downloader typed against the Azure connection and downloader configs."""

    connection_config: DatabricksAzureVolumesConnectionConfig
    # Annotated with the Azure-specific config (a subclass of the shared
    # downloader config) so it matches the config class this downloader is
    # registered with and mirrors the sibling indexer/uploader declarations.
    download_config: DatabricksAzureVolumesDownloaderConfig
    # Tags this downloader with the module-level connector identifier.
    connector_type: str = CONNECTOR_TYPE
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# Azure uploader settings; declares nothing beyond the shared Databricks
# Volumes uploader config it inherits from.
class DatabricksAzureVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    pass
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass
class DatabricksAzureVolumesUploader(DatabricksVolumesUploader):
    """Databricks Volumes uploader typed against the Azure connection and uploader configs."""

    connection_config: DatabricksAzureVolumesConnectionConfig
    # Optional for callers: a fresh default uploader config is built per instance.
    upload_config: DatabricksAzureVolumesUploaderConfig = field(
        default_factory=lambda: DatabricksAzureVolumesUploaderConfig()
    )
    # Tags this uploader with the module-level connector identifier.
    connector_type: str = CONNECTOR_TYPE
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# Destination registry entry: bundles the Azure uploader, its config class,
# and the Azure connection config.
databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
    uploader=DatabricksAzureVolumesUploader,
    uploader_config=DatabricksAzureVolumesUploaderConfig,
    connection_config=DatabricksAzureVolumesConnectionConfig,
)
|
|
95
|
+
|
|
96
|
+
# Source registry entry: bundles the Azure indexer and downloader, their
# config classes, and the Azure connection config.
databricks_azure_volumes_source_entry = SourceRegistryEntry(
    indexer=DatabricksAzureVolumesIndexer,
    indexer_config=DatabricksAzureVolumesIndexerConfig,
    downloader=DatabricksAzureVolumesDownloader,
    downloader_config=DatabricksAzureVolumesDownloaderConfig,
    connection_config=DatabricksAzureVolumesConnectionConfig,
)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from pydantic import Field, Secret
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
7
|
+
DestinationRegistryEntry,
|
|
8
|
+
SourceRegistryEntry,
|
|
9
|
+
)
|
|
10
|
+
from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
|
|
11
|
+
DatabricksVolumesAccessConfig,
|
|
12
|
+
DatabricksVolumesConnectionConfig,
|
|
13
|
+
DatabricksVolumesDownloader,
|
|
14
|
+
DatabricksVolumesDownloaderConfig,
|
|
15
|
+
DatabricksVolumesIndexer,
|
|
16
|
+
DatabricksVolumesIndexerConfig,
|
|
17
|
+
DatabricksVolumesUploader,
|
|
18
|
+
DatabricksVolumesUploaderConfig,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
CONNECTOR_TYPE = "databricks_volumes_gcp"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# GCP-flavoured credentials layered on the shared Databricks Volumes access
# config. Every field declared here is optional and defaults to None.
class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        description="The Databricks account ID for the Databricks accounts endpoint.",
        default=None,
    )
    profile: Optional[str] = None
    google_credentials: Optional[str] = None
    google_service_account: Optional[str] = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Connection settings for the GCP variant: narrows ``access_config`` to the
# Google access config, wrapped in pydantic's ``Secret``.
class DatabricksGoogleVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    access_config: Secret[DatabricksGoogleVolumesAccessConfig]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# GCP indexer settings; declares nothing beyond the shared Databricks
# Volumes indexer config it inherits from.
class DatabricksGoogleVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    pass
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class DatabricksGoogleVolumesIndexer(DatabricksVolumesIndexer):
    """Databricks Volumes indexer typed against the GCP connection and indexer configs."""

    connection_config: DatabricksGoogleVolumesConnectionConfig
    index_config: DatabricksGoogleVolumesIndexerConfig
    # Tags this indexer with the module-level connector identifier.
    connector_type: str = CONNECTOR_TYPE
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# GCP downloader settings; declares nothing beyond the shared Databricks
# Volumes downloader config it inherits from.
class DatabricksGoogleVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    pass
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class DatabricksGoogleVolumesDownloader(DatabricksVolumesDownloader):
    """Databricks Volumes downloader typed against the GCP connection and downloader configs."""

    connection_config: DatabricksGoogleVolumesConnectionConfig
    # Annotated with the GCP-specific config (a subclass of the shared
    # downloader config) so it matches the config class this downloader is
    # registered with and mirrors the sibling indexer/uploader declarations.
    download_config: DatabricksGoogleVolumesDownloaderConfig
    # Tags this downloader with the module-level connector identifier.
    connector_type: str = CONNECTOR_TYPE
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# GCP uploader settings; declares nothing beyond the shared Databricks
# Volumes uploader config it inherits from.
class DatabricksGoogleVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    pass
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class DatabricksGoogleVolumesUploader(DatabricksVolumesUploader):
    """Databricks Volumes uploader typed against the GCP connection and uploader configs."""

    connection_config: DatabricksGoogleVolumesConnectionConfig
    # Optional for callers: a fresh default uploader config is built per instance.
    upload_config: DatabricksGoogleVolumesUploaderConfig = field(
        default_factory=lambda: DatabricksGoogleVolumesUploaderConfig()
    )
    # Tags this uploader with the module-level connector identifier.
    connector_type: str = CONNECTOR_TYPE
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# Destination registry entry: bundles the GCP uploader, its config class,
# and the GCP connection config.
databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
    uploader=DatabricksGoogleVolumesUploader,
    uploader_config=DatabricksGoogleVolumesUploaderConfig,
    connection_config=DatabricksGoogleVolumesConnectionConfig,
)
|
|
78
|
+
|
|
79
|
+
# Source registry entry: bundles the GCP indexer and downloader, their
# config classes, and the GCP connection config.
databricks_gcp_volumes_source_entry = SourceRegistryEntry(
    indexer=DatabricksGoogleVolumesIndexer,
    indexer_config=DatabricksGoogleVolumesIndexerConfig,
    downloader=DatabricksGoogleVolumesDownloader,
    downloader_config=DatabricksGoogleVolumesDownloaderConfig,
    connection_config=DatabricksGoogleVolumesConnectionConfig,
)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from pydantic import Field, Secret
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
7
|
+
DestinationRegistryEntry,
|
|
8
|
+
SourceRegistryEntry,
|
|
9
|
+
)
|
|
10
|
+
from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
|
|
11
|
+
DatabricksVolumesAccessConfig,
|
|
12
|
+
DatabricksVolumesConnectionConfig,
|
|
13
|
+
DatabricksVolumesDownloader,
|
|
14
|
+
DatabricksVolumesDownloaderConfig,
|
|
15
|
+
DatabricksVolumesIndexer,
|
|
16
|
+
DatabricksVolumesIndexerConfig,
|
|
17
|
+
DatabricksVolumesUploader,
|
|
18
|
+
DatabricksVolumesUploaderConfig,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
CONNECTOR_TYPE = "databricks_volumes"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# OAuth-app credentials layered on the shared Databricks Volumes access
# config. Every field declared here is optional and defaults to None.
class DatabricksNativeVolumesAccessConfig(DatabricksVolumesAccessConfig):
    client_id: Optional[str] = Field(
        description="Client ID of the OAuth app.",
        default=None,
    )
    client_secret: Optional[str] = Field(
        description="Client Secret of the OAuth app.",
        default=None,
    )
    profile: Optional[str] = None
    azure_workspace_resource_id: Optional[str] = Field(
        description=(
            "The Azure Resource Manager ID for the Azure Databricks workspace, "
            "which is exchanged for a Databricks host URL."
        ),
        default=None,
    )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Connection settings for the native variant: narrows ``access_config`` to the
# native access config, wrapped in pydantic's ``Secret``.
class DatabricksNativeVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    access_config: Secret[DatabricksNativeVolumesAccessConfig]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Native indexer settings; declares nothing beyond the shared Databricks
# Volumes indexer config it inherits from.
class DatabricksNativeVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    pass
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class DatabricksNativeVolumesIndexer(DatabricksVolumesIndexer):
    """Databricks Volumes indexer typed against the native connection and indexer configs."""

    connection_config: DatabricksNativeVolumesConnectionConfig
    index_config: DatabricksNativeVolumesIndexerConfig
    # Tags this indexer with the module-level connector identifier.
    connector_type: str = CONNECTOR_TYPE
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Native downloader settings; declares nothing beyond the shared Databricks
# Volumes downloader config it inherits from.
class DatabricksNativeVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    pass
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class DatabricksNativeVolumesDownloader(DatabricksVolumesDownloader):
    """Databricks Volumes downloader typed against the native connection and downloader configs."""

    connection_config: DatabricksNativeVolumesConnectionConfig
    # Annotated with the native-specific config (a subclass of the shared
    # downloader config) so it matches the config class this downloader is
    # registered with and mirrors the sibling indexer/uploader declarations.
    download_config: DatabricksNativeVolumesDownloaderConfig
    # Tags this downloader with the module-level connector identifier.
    connector_type: str = CONNECTOR_TYPE
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# Native uploader settings; declares nothing beyond the shared Databricks
# Volumes uploader config it inherits from.
class DatabricksNativeVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    pass
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class DatabricksNativeVolumesUploader(DatabricksVolumesUploader):
    """Databricks Volumes uploader typed against the native connection and uploader configs."""

    connection_config: DatabricksNativeVolumesConnectionConfig
    # No default is declared here, so callers must supply the uploader config.
    upload_config: DatabricksNativeVolumesUploaderConfig
    # Tags this uploader with the module-level connector identifier.
    connector_type: str = CONNECTOR_TYPE
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# Destination registry entry: bundles the native uploader, its config class,
# and the native connection config.
databricks_native_volumes_destination_entry = DestinationRegistryEntry(
    uploader=DatabricksNativeVolumesUploader,
    uploader_config=DatabricksNativeVolumesUploaderConfig,
    connection_config=DatabricksNativeVolumesConnectionConfig,
)
|
|
79
|
+
|
|
80
|
+
# Source registry entry: bundles the native indexer and downloader, their
# config classes, and the native connection config.
databricks_native_volumes_source_entry = SourceRegistryEntry(
    indexer=DatabricksNativeVolumesIndexer,
    indexer_config=DatabricksNativeVolumesIndexerConfig,
    downloader=DatabricksNativeVolumesDownloader,
    downloader_config=DatabricksNativeVolumesDownloaderConfig,
    connection_config=DatabricksNativeVolumesConnectionConfig,
)
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import traceback
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from multiprocessing import Process, Queue
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Optional
|
|
8
|
+
from urllib.parse import urlparse
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from pydantic import Field, Secret
|
|
12
|
+
|
|
13
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
14
|
+
from unstructured_ingest.utils.data_prep import get_data_df
|
|
15
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
16
|
+
from unstructured_ingest.utils.table import convert_to_pandas_dataframe
|
|
17
|
+
from unstructured_ingest.v2.interfaces import (
|
|
18
|
+
AccessConfig,
|
|
19
|
+
ConnectionConfig,
|
|
20
|
+
FileData,
|
|
21
|
+
Uploader,
|
|
22
|
+
UploaderConfig,
|
|
23
|
+
UploadStager,
|
|
24
|
+
UploadStagerConfig,
|
|
25
|
+
)
|
|
26
|
+
from unstructured_ingest.v2.logger import logger
|
|
27
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
28
|
+
|
|
29
|
+
CONNECTOR_TYPE = "delta_table"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@requires_dependencies(["deltalake"], extras="delta-table")
def write_deltalake_with_error_handling(queue, **kwargs):
    """Call deltalake's ``write_deltalake`` and report a failure through ``queue``.

    Args:
        queue: Receives the formatted traceback (via ``put``) if the write raises.
        **kwargs: Forwarded unchanged to ``write_deltalake``.

    Any ``Exception`` raised by the write is not propagated; its traceback text
    is placed on ``queue`` instead.
    """
    from deltalake.writer import write_deltalake

    try:
        write_deltalake(**kwargs)
    except Exception:
        failure_report = traceback.format_exc()
        queue.put(failure_report)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Optional AWS key pair for the delta table destination; both default to None.
class DeltaTableAccessConfig(AccessConfig):
    aws_access_key_id: Optional[str] = Field(
        description="AWS Access Key Id",
        default=None,
    )
    aws_secret_access_key: Optional[str] = Field(
        description="AWS Secret Access Key",
        default=None,
    )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Connection settings for the delta table destination: the target table URI
# plus optional AWS region and credentials.
class DeltaTableConnectionConfig(ConnectionConfig):
    access_config: Secret[DeltaTableAccessConfig] = Field(
        validate_default=True,
        default=DeltaTableAccessConfig(),
    )
    aws_region: Optional[str] = Field(description="AWS Region", default=None)
    # NOTE(review): annotated as ``str`` but defaults to None — confirm whether
    # this field is meant to be required.
    table_uri: str = Field(
        description=(
            "Local path or path to the target folder in the S3 bucket, "
            "formatted as s3://my-bucket/my-folder/"
        ),
        default=None,
    )

    def update_storage_options(self, storage_options: dict) -> None:
        """Add AWS settings to ``storage_options`` in place.

        Nothing is written unless the region, access key id and secret access
        key are all set.
        """
        credentials = self.access_config.get_secret_value()
        key_id = credentials.aws_access_key_id
        secret_key = credentials.aws_secret_access_key
        if not (self.aws_region and key_id and secret_key):
            return

        storage_options["AWS_REGION"] = self.aws_region
        storage_options["AWS_ACCESS_KEY_ID"] = key_id
        storage_options["AWS_SECRET_ACCESS_KEY"] = secret_key
        # Delta-rs doesn't support concurrent S3 writes without external locks (DynamoDB).
        # This flag allows single-writer uploads to S3 without using locks, according to:
        # https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/
        storage_options["AWS_S3_ALLOW_UNSAFE_RENAME"] = "true"
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# Stager settings for the delta table destination; declares no fields of its own.
class DeltaTableUploadStagerConfig(UploadStagerConfig):
    pass
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class DeltaTableUploadStager(UploadStager):
    """Stage an elements JSON file as a parquet file ahead of upload."""

    upload_stager_config: DeltaTableUploadStagerConfig = field(
        default_factory=lambda: DeltaTableUploadStagerConfig()
    )

    def run(
        self,
        elements_filepath: Path,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Convert the elements JSON at ``elements_filepath`` to parquet.

        Args:
            elements_filepath: JSON file holding the elements to stage.
            output_dir: Directory the parquet file is written into.
            output_filename: Base name of the output; ``.parquet`` is appended.
            **kwargs: Accepted but not used here.

        Returns:
            Path of the parquet file that was written.
        """
        # JSON interchange text is UTF-8; read it as such instead of relying on
        # the platform's locale encoding.
        with open(elements_filepath, encoding="utf-8") as elements_file:
            elements_contents = json.load(elements_file)

        output_path = Path(output_dir) / f"{output_filename}.parquet"

        df = convert_to_pandas_dataframe(elements_dict=elements_contents)
        df.to_parquet(output_path)

        return output_path
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# Uploader settings for the delta table destination; declares no fields of its own.
class DeltaTableUploaderConfig(UploaderConfig):
    pass
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass
class DeltaTableUploader(Uploader):
    """Write dataframes to a delta table located at the configured table URI."""

    upload_config: DeltaTableUploaderConfig
    connection_config: DeltaTableConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["boto3"], extras="delta-table")
    def precheck(self):
        """Validate S3 access and region when full AWS settings are configured.

        The check is skipped unless the region, access key id and secret access
        key are all set. It writes an empty object at the table URI's key and
        compares the bucket's region against the configured one.

        Raises:
            DestinationConnectionError: If any step of the validation fails.
        """
        secrets = self.connection_config.access_config.get_secret_value()
        if not (
            self.connection_config.aws_region
            and secrets.aws_access_key_id
            and secrets.aws_secret_access_key
        ):
            return

        from boto3 import client

        url = urlparse(self.connection_config.table_uri)
        bucket_name = url.netloc
        dir_path = url.path.lstrip("/")

        try:
            s3_client = client(
                "s3",
                aws_access_key_id=secrets.aws_access_key_id,
                aws_secret_access_key=secrets.aws_secret_access_key,
            )
            s3_client.put_object(Bucket=bucket_name, Key=dir_path, Body=b"")

            response = s3_client.get_bucket_location(Bucket=bucket_name)

            # S3 reports a null LocationConstraint for us-east-1 buckets and the
            # legacy value "EU" for eu-west-1; map both to real region names
            # before comparing so valid configurations are not rejected.
            location = response.get("LocationConstraint")
            legacy_regions = {None: "us-east-1", "EU": "eu-west-1"}
            bucket_region = legacy_regions.get(location, location)

            if self.connection_config.aws_region != bucket_region:
                raise ValueError("Wrong AWS Region was provided.")

        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
        """Write ``df`` to the table URI joined with the file's relative path.

        The write runs in overwrite mode inside a separate process.

        Raises:
            RuntimeError: If the writer process reported an exception.
        """
        updated_upload_path = os.path.join(
            self.connection_config.table_uri, file_data.source_identifiers.relative_path
        )
        logger.info(
            f"writing {len(df)} rows to destination table "
            f"at {updated_upload_path}\ndtypes: {df.dtypes}",
        )
        storage_options = {}
        self.connection_config.update_storage_options(storage_options=storage_options)

        writer_kwargs = {
            "table_or_uri": updated_upload_path,
            "data": df,
            "mode": "overwrite",
            "storage_options": storage_options,
        }
        queue = Queue()
        # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
        # ingest to fail, even though all tasks are completed normally. Putting the writer into a
        # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
        # rust backend to finish
        writer = Process(
            target=write_deltalake_with_error_handling,
            kwargs={"queue": queue, **writer_kwargs},
        )
        writer.start()
        writer.join()

        # The child only puts something on the queue when the write raised.
        if not queue.empty():
            error_message = queue.get()
            logger.error(f"Exception occurred in write_deltalake: {error_message}")
            raise RuntimeError(f"Error in write_deltalake: {error_message}")

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Build a dataframe from in-memory records and upload it."""
        df = pd.DataFrame(data=data)
        self.upload_dataframe(df=df, file_data=file_data)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the file at ``path`` into a dataframe and upload it."""
        df = get_data_df(path)
        self.upload_dataframe(df=df, file_data=file_data)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# Destination registry entry: bundles the delta table stager and uploader,
# their config classes, and the connection config.
delta_table_destination_entry = DestinationRegistryEntry(
    upload_stager=DeltaTableUploadStager,
    upload_stager_config=DeltaTableUploadStagerConfig,
    uploader=DeltaTableUploader,
    uploader_config=DeltaTableUploaderConfig,
    connection_config=DeltaTableConnectionConfig,
)
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import datetime as dt
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
4
|
+
|
|
5
|
+
from pydantic import Field, Secret
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.error import SourceConnectionError
|
|
8
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
|
+
from unstructured_ingest.v2.interfaces import (
|
|
10
|
+
AccessConfig,
|
|
11
|
+
ConnectionConfig,
|
|
12
|
+
Downloader,
|
|
13
|
+
DownloaderConfig,
|
|
14
|
+
DownloadResponse,
|
|
15
|
+
FileData,
|
|
16
|
+
FileDataSourceMetadata,
|
|
17
|
+
Indexer,
|
|
18
|
+
IndexerConfig,
|
|
19
|
+
SourceIdentifiers,
|
|
20
|
+
)
|
|
21
|
+
from unstructured_ingest.v2.logger import logger
|
|
22
|
+
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from discord import Client as DiscordClient
|
|
26
|
+
|
|
27
|
+
CONNECTOR_TYPE = "discord"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Secret material for the Discord connector: a single required API token.
class DiscordAccessConfig(AccessConfig):
    token: str = Field(
        description="Discord API token",
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Connection settings for the Discord connector and a factory for its client.
class DiscordConnectionConfig(ConnectionConfig):
    # NOTE(review): the default is the DiscordAccessConfig class itself, not an
    # instance — confirm this is intended.
    access_config: Secret[DiscordAccessConfig] = Field(
        validate_default=True,
        default=DiscordAccessConfig,
    )

    @requires_dependencies(["discord"], extras="discord")
    def get_client(self) -> "DiscordClient":
        """Build a new Discord client using default intents plus message content."""
        import discord

        client_intents = discord.Intents.default()
        client_intents.message_content = True
        new_client = discord.Client(intents=client_intents)
        return new_client
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Indexer settings for the Discord connector: which channels to process.
class DiscordIndexerConfig(IndexerConfig):
    # NOTE(review): annotated as ``list[str]`` but defaults to None — confirm
    # whether the annotation should be optional.
    channels: list[str] = Field(
        description="List of Discord channel IDs to process",
        default=None,
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class DiscordIndexer(Indexer):
    """Emit one ``FileData`` per configured Discord channel."""

    connection_config: DiscordConnectionConfig
    index_config: DiscordIndexerConfig

    def precheck(self) -> None:
        """Fail early when the token or the channel list is missing.

        Raises:
            SourceConnectionError: If no token or no channels are configured.
        """
        access = self.connection_config.access_config.get_secret_value()
        if not access.token:
            raise SourceConnectionError("Discord token is missing")
        if not self.index_config.channels:
            raise SourceConnectionError("No channels provided")

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield file data for every distinct configured channel id."""
        # The client is built but not used here; the call is kept as-is.
        self.connection_config.get_client()
        unique_channels: set[str] = set(self.index_config.channels or [])

        for channel_id in unique_channels:
            channel_file_data = self.get_channel_file_data(channel_id=channel_id)
            if channel_file_data:
                yield channel_file_data

    def get_channel_file_data(self, channel_id: str) -> Optional[FileData]:
        """Describe one channel as a ``<channel_id>.txt`` file record."""
        text_filename = f"{channel_id}.txt"
        processed_at = str(dt.datetime.utcnow().isoformat())
        return FileData(
            identifier=channel_id,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=SourceIdentifiers(
                filename=text_filename,
                fullpath=text_filename,
            ),
            metadata=FileDataSourceMetadata(
                record_locator={"channel_id": channel_id},
                date_processed=processed_at,
            ),
        )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# Downloader settings for the Discord connector.
class DiscordDownloaderConfig(DownloaderConfig):
    limit: Optional[int] = Field(
        description="Limit on how many messages per channel to pull in",
        default=100,
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@dataclass
class DiscordDownloader(Downloader):
    """Download a Discord channel's message history into a text file."""

    connection_config: DiscordConnectionConfig
    download_config: DiscordDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Report that this downloader is driven through ``run_async``."""
        return True

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Synchronous download is not implemented; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Fetch the channel's messages and write their contents to the download path.

        Args:
            file_data: Record whose ``metadata.record_locator`` holds ``channel_id``.
            **kwargs: Accepted but not used here.

        Returns:
            The download response generated for the written file.

        Raises:
            ValueError: If the record locator has no ``channel_id`` or the
                channel cannot be found.
        """
        record_locator = file_data.metadata.record_locator

        if "channel_id" not in record_locator:
            raise ValueError(f"No channel id in file data record locator: {record_locator}")

        client = self.connection_config.get_client()
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)

        messages = []
        channel_id = record_locator["channel_id"]
        # Failures raised inside the event handler are collected here because
        # discord.py's event dispatcher does not propagate them to the caller.
        handler_errors: list[Exception] = []

        @client.event
        async def on_ready():
            try:
                logger.debug("Discord Bot is ready")
                channel = client.get_channel(int(channel_id))
                if not channel:
                    raise ValueError(f"channel not found for id: {channel_id}")
                logger.debug(f"Processing messages for channel: {channel.name}")
                async for msg in channel.history(limit=self.download_config.limit):
                    messages.append(msg)
                logger.debug(f"Fetched {len(messages)} messages")
            except Exception as e:
                handler_errors.append(e)
            finally:
                # Always close so that client.start() below returns, even on failure.
                await client.close()

        try:
            await client.start(self.connection_config.access_config.get_secret_value().token)
        finally:
            await client.close()

        if handler_errors:
            raise handler_errors[0]

        content = "\n".join([message.content for message in messages])

        # Message text can hold any Unicode; don't depend on the locale encoding.
        with open(download_path, "w", encoding="utf-8") as file:
            file.write(content)

        return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# Source registry entry: bundles the Discord connection config with the
# indexer and downloader and their config classes.
discord_source_entry = SourceRegistryEntry(
    connection_config=DiscordConnectionConfig,
    indexer=DiscordIndexer,
    indexer_config=DiscordIndexerConfig,
    downloader=DiscordDownloader,
    downloader_config=DiscordDownloaderConfig,
)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
4
|
+
add_destination_entry,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
from .duckdb import CONNECTOR_TYPE as DUCKDB_CONNECTOR_TYPE
|
|
8
|
+
from .duckdb import duckdb_destination_entry
|
|
9
|
+
from .motherduck import CONNECTOR_TYPE as MOTHERDUCK_CONNECTOR_TYPE
|
|
10
|
+
from .motherduck import motherduck_destination_entry
|
|
11
|
+
|
|
12
|
+
# Register both destinations at import time: DuckDB first, then MotherDuck.
add_destination_entry(
    destination_type=DUCKDB_CONNECTOR_TYPE,
    entry=duckdb_destination_entry,
)
add_destination_entry(
    destination_type=MOTHERDUCK_CONNECTOR_TYPE,
    entry=motherduck_destination_entry,
)
|