unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- examples/airtable.py +44 -0
- examples/azure_cognitive_search.py +55 -0
- examples/chroma.py +54 -0
- examples/couchbase.py +55 -0
- examples/databricks_volumes_dest.py +55 -0
- examples/databricks_volumes_source.py +53 -0
- examples/delta_table.py +45 -0
- examples/discord_example.py +36 -0
- examples/elasticsearch.py +49 -0
- examples/google_drive.py +45 -0
- examples/kdbai.py +54 -0
- examples/local.py +36 -0
- examples/milvus.py +44 -0
- examples/mongodb.py +53 -0
- examples/opensearch.py +50 -0
- examples/pinecone.py +57 -0
- examples/s3.py +38 -0
- examples/salesforce.py +44 -0
- examples/sharepoint.py +47 -0
- examples/singlestore.py +49 -0
- examples/sql.py +90 -0
- examples/vectara.py +54 -0
- examples/weaviate.py +44 -0
- test/integration/chunkers/test_chunkers.py +1 -1
- test/integration/connectors/conftest.py +1 -1
- test/integration/connectors/databricks/test_volumes_native.py +3 -3
- test/integration/connectors/discord/test_discord.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +2 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
- test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
- test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
- test/integration/connectors/sql/test_postgres.py +2 -2
- test/integration/connectors/sql/test_singlestore.py +2 -2
- test/integration/connectors/sql/test_snowflake.py +2 -2
- test/integration/connectors/sql/test_sqlite.py +2 -2
- test/integration/connectors/sql/test_vastdb.py +1 -1
- test/integration/connectors/test_astradb.py +2 -2
- test/integration/connectors/test_azure_ai_search.py +2 -2
- test/integration/connectors/test_chroma.py +2 -2
- test/integration/connectors/test_confluence.py +1 -1
- test/integration/connectors/test_delta_table.py +2 -2
- test/integration/connectors/test_dropbox.py +2 -2
- test/integration/connectors/test_github.py +1 -1
- test/integration/connectors/test_google_drive.py +2 -2
- test/integration/connectors/test_jira.py +1 -1
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +2 -2
- test/integration/connectors/test_mongodb.py +2 -2
- test/integration/connectors/test_neo4j.py +7 -7
- test/integration/connectors/test_notion.py +2 -2
- test/integration/connectors/test_onedrive.py +2 -2
- test/integration/connectors/test_pinecone.py +3 -3
- test/integration/connectors/test_qdrant.py +6 -6
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +3 -3
- test/integration/connectors/test_sharepoint.py +1 -1
- test/integration/connectors/test_vectara.py +4 -4
- test/integration/connectors/test_zendesk.py +2 -2
- test/integration/connectors/utils/validation/destination.py +2 -2
- test/integration/connectors/utils/validation/source.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +1 -1
- test/integration/connectors/weaviate/test_local.py +2 -2
- test/integration/embedders/test_azure_openai.py +1 -1
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -1
- test/integration/embedders/test_mixedbread.py +1 -1
- test/integration/embedders/test_octoai.py +2 -2
- test/integration/embedders/test_openai.py +2 -2
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +1 -1
- test/integration/embedders/test_voyageai.py +1 -1
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
- test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
- test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
- test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
- test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
- test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
- test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
- test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
- test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
- test/unit/test_html.py +1 -1
- test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
- test/unit/test_utils.py +106 -97
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/__init__.py +0 -14
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +259 -9
- unstructured_ingest/cli/base/dest.py +58 -61
- unstructured_ingest/cli/base/src.py +54 -36
- unstructured_ingest/cli/cli.py +4 -17
- unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
- unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
- unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
- unstructured_ingest/embed/bedrock.py +3 -3
- unstructured_ingest/embed/octoai.py +3 -3
- unstructured_ingest/embed/openai.py +3 -3
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +4 -4
- unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
- unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
- unstructured_ingest/{v2/otel.py → otel.py} +1 -1
- unstructured_ingest/pipeline/__init__.py +0 -22
- unstructured_ingest/pipeline/interfaces.py +179 -238
- unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
- unstructured_ingest/pipeline/pipeline.py +388 -97
- unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
- unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
- unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
- unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
- unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
- unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
- unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
- unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
- unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
- unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
- unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
- unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
- unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
- unstructured_ingest/utils/compression.py +1 -48
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/utils/html.py +3 -3
- unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
- unstructured_ingest/utils/string_and_date_utils.py +1 -1
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +21 -21
- unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
- test/unit/v2/test_utils.py +0 -82
- unstructured_ingest/cli/cmd_factory.py +0 -12
- unstructured_ingest/cli/cmds/__init__.py +0 -145
- unstructured_ingest/cli/cmds/airtable.py +0 -69
- unstructured_ingest/cli/cmds/astradb.py +0 -99
- unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
- unstructured_ingest/cli/cmds/biomed.py +0 -52
- unstructured_ingest/cli/cmds/chroma.py +0 -104
- unstructured_ingest/cli/cmds/clarifai.py +0 -71
- unstructured_ingest/cli/cmds/confluence.py +0 -69
- unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
- unstructured_ingest/cli/cmds/delta_table.py +0 -94
- unstructured_ingest/cli/cmds/discord.py +0 -47
- unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
- unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
- unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
- unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
- unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
- unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
- unstructured_ingest/cli/cmds/github.py +0 -54
- unstructured_ingest/cli/cmds/gitlab.py +0 -54
- unstructured_ingest/cli/cmds/google_drive.py +0 -49
- unstructured_ingest/cli/cmds/hubspot.py +0 -70
- unstructured_ingest/cli/cmds/jira.py +0 -71
- unstructured_ingest/cli/cmds/kafka.py +0 -102
- unstructured_ingest/cli/cmds/local.py +0 -43
- unstructured_ingest/cli/cmds/mongodb.py +0 -72
- unstructured_ingest/cli/cmds/notion.py +0 -48
- unstructured_ingest/cli/cmds/onedrive.py +0 -66
- unstructured_ingest/cli/cmds/opensearch.py +0 -117
- unstructured_ingest/cli/cmds/outlook.py +0 -67
- unstructured_ingest/cli/cmds/pinecone.py +0 -71
- unstructured_ingest/cli/cmds/qdrant.py +0 -124
- unstructured_ingest/cli/cmds/reddit.py +0 -67
- unstructured_ingest/cli/cmds/salesforce.py +0 -58
- unstructured_ingest/cli/cmds/sharepoint.py +0 -66
- unstructured_ingest/cli/cmds/slack.py +0 -56
- unstructured_ingest/cli/cmds/sql.py +0 -66
- unstructured_ingest/cli/cmds/vectara.py +0 -66
- unstructured_ingest/cli/cmds/weaviate.py +0 -98
- unstructured_ingest/cli/cmds/wikipedia.py +0 -40
- unstructured_ingest/cli/common.py +0 -7
- unstructured_ingest/cli/interfaces.py +0 -663
- unstructured_ingest/cli/utils.py +0 -205
- unstructured_ingest/connector/airtable.py +0 -309
- unstructured_ingest/connector/astradb.py +0 -267
- unstructured_ingest/connector/azure_ai_search.py +0 -144
- unstructured_ingest/connector/biomed.py +0 -320
- unstructured_ingest/connector/chroma.py +0 -158
- unstructured_ingest/connector/clarifai.py +0 -122
- unstructured_ingest/connector/confluence.py +0 -285
- unstructured_ingest/connector/databricks_volumes.py +0 -137
- unstructured_ingest/connector/delta_table.py +0 -203
- unstructured_ingest/connector/discord.py +0 -180
- unstructured_ingest/connector/elasticsearch.py +0 -396
- unstructured_ingest/connector/fsspec/azure.py +0 -78
- unstructured_ingest/connector/fsspec/box.py +0 -109
- unstructured_ingest/connector/fsspec/dropbox.py +0 -160
- unstructured_ingest/connector/fsspec/fsspec.py +0 -359
- unstructured_ingest/connector/fsspec/gcs.py +0 -82
- unstructured_ingest/connector/fsspec/s3.py +0 -62
- unstructured_ingest/connector/fsspec/sftp.py +0 -81
- unstructured_ingest/connector/git.py +0 -124
- unstructured_ingest/connector/github.py +0 -174
- unstructured_ingest/connector/gitlab.py +0 -142
- unstructured_ingest/connector/google_drive.py +0 -348
- unstructured_ingest/connector/hubspot.py +0 -278
- unstructured_ingest/connector/jira.py +0 -469
- unstructured_ingest/connector/kafka.py +0 -293
- unstructured_ingest/connector/local.py +0 -139
- unstructured_ingest/connector/mongodb.py +0 -284
- unstructured_ingest/connector/notion/client.py +0 -248
- unstructured_ingest/connector/notion/connector.py +0 -469
- unstructured_ingest/connector/notion/helpers.py +0 -584
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
- unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
- unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
- unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
- unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
- unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
- unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
- unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
- unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
- unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
- unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
- unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
- unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
- unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
- unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
- unstructured_ingest/connector/notion/types/date.py +0 -26
- unstructured_ingest/connector/notion/types/file.py +0 -51
- unstructured_ingest/connector/notion/types/user.py +0 -76
- unstructured_ingest/connector/onedrive.py +0 -232
- unstructured_ingest/connector/opensearch.py +0 -218
- unstructured_ingest/connector/outlook.py +0 -285
- unstructured_ingest/connector/pinecone.py +0 -150
- unstructured_ingest/connector/qdrant.py +0 -144
- unstructured_ingest/connector/reddit.py +0 -166
- unstructured_ingest/connector/registry.py +0 -109
- unstructured_ingest/connector/salesforce.py +0 -301
- unstructured_ingest/connector/sharepoint.py +0 -573
- unstructured_ingest/connector/slack.py +0 -224
- unstructured_ingest/connector/sql.py +0 -199
- unstructured_ingest/connector/vectara.py +0 -253
- unstructured_ingest/connector/weaviate.py +0 -190
- unstructured_ingest/connector/wikipedia.py +0 -208
- unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
- unstructured_ingest/enhanced_dataclass/core.py +0 -99
- unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
- unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
- unstructured_ingest/interfaces.py +0 -852
- unstructured_ingest/pipeline/copy.py +0 -19
- unstructured_ingest/pipeline/doc_factory.py +0 -12
- unstructured_ingest/pipeline/partition.py +0 -60
- unstructured_ingest/pipeline/permissions.py +0 -12
- unstructured_ingest/pipeline/reformat/chunking.py +0 -134
- unstructured_ingest/pipeline/reformat/embedding.py +0 -64
- unstructured_ingest/pipeline/source.py +0 -77
- unstructured_ingest/pipeline/utils.py +0 -6
- unstructured_ingest/pipeline/write.py +0 -18
- unstructured_ingest/processor.py +0 -93
- unstructured_ingest/runner/__init__.py +0 -104
- unstructured_ingest/runner/airtable.py +0 -35
- unstructured_ingest/runner/astradb.py +0 -34
- unstructured_ingest/runner/base_runner.py +0 -89
- unstructured_ingest/runner/biomed.py +0 -45
- unstructured_ingest/runner/confluence.py +0 -35
- unstructured_ingest/runner/delta_table.py +0 -34
- unstructured_ingest/runner/discord.py +0 -35
- unstructured_ingest/runner/elasticsearch.py +0 -40
- unstructured_ingest/runner/fsspec/azure.py +0 -30
- unstructured_ingest/runner/fsspec/box.py +0 -28
- unstructured_ingest/runner/fsspec/dropbox.py +0 -30
- unstructured_ingest/runner/fsspec/fsspec.py +0 -40
- unstructured_ingest/runner/fsspec/gcs.py +0 -28
- unstructured_ingest/runner/fsspec/s3.py +0 -28
- unstructured_ingest/runner/fsspec/sftp.py +0 -28
- unstructured_ingest/runner/github.py +0 -37
- unstructured_ingest/runner/gitlab.py +0 -37
- unstructured_ingest/runner/google_drive.py +0 -35
- unstructured_ingest/runner/hubspot.py +0 -35
- unstructured_ingest/runner/jira.py +0 -35
- unstructured_ingest/runner/kafka.py +0 -34
- unstructured_ingest/runner/local.py +0 -23
- unstructured_ingest/runner/mongodb.py +0 -34
- unstructured_ingest/runner/notion.py +0 -61
- unstructured_ingest/runner/onedrive.py +0 -35
- unstructured_ingest/runner/opensearch.py +0 -40
- unstructured_ingest/runner/outlook.py +0 -33
- unstructured_ingest/runner/reddit.py +0 -35
- unstructured_ingest/runner/salesforce.py +0 -33
- unstructured_ingest/runner/sharepoint.py +0 -35
- unstructured_ingest/runner/slack.py +0 -33
- unstructured_ingest/runner/utils.py +0 -47
- unstructured_ingest/runner/wikipedia.py +0 -35
- unstructured_ingest/runner/writers/__init__.py +0 -48
- unstructured_ingest/runner/writers/astradb.py +0 -22
- unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
- unstructured_ingest/runner/writers/base_writer.py +0 -26
- unstructured_ingest/runner/writers/chroma.py +0 -22
- unstructured_ingest/runner/writers/clarifai.py +0 -19
- unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
- unstructured_ingest/runner/writers/delta_table.py +0 -24
- unstructured_ingest/runner/writers/elasticsearch.py +0 -24
- unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
- unstructured_ingest/runner/writers/fsspec/box.py +0 -21
- unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
- unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
- unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
- unstructured_ingest/runner/writers/kafka.py +0 -21
- unstructured_ingest/runner/writers/mongodb.py +0 -21
- unstructured_ingest/runner/writers/opensearch.py +0 -26
- unstructured_ingest/runner/writers/pinecone.py +0 -21
- unstructured_ingest/runner/writers/qdrant.py +0 -19
- unstructured_ingest/runner/writers/sql.py +0 -22
- unstructured_ingest/runner/writers/vectara.py +0 -22
- unstructured_ingest/runner/writers/weaviate.py +0 -21
- unstructured_ingest/utils/google_filetype.py +0 -9
- unstructured_ingest/v2/__init__.py +0 -1
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +0 -4
- unstructured_ingest/v2/cli/base/cmd.py +0 -269
- unstructured_ingest/v2/cli/base/dest.py +0 -85
- unstructured_ingest/v2/cli/base/src.py +0 -85
- unstructured_ingest/v2/cli/cli.py +0 -24
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/logger.py +0 -126
- unstructured_ingest/v2/main.py +0 -11
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +0 -211
- unstructured_ingest/v2/pipeline/pipeline.py +0 -408
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
- {test/unit/v2 → examples}/__init__.py +0 -0
- /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
- /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/data_generator.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
- /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
- /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
- /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
- /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
- /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
- /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
- /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
- /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
- /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
- /unstructured_ingest/{v2 → utils}/constants.py +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,180 +0,0 @@
|
|
|
1
|
-
import datetime as dt
|
|
2
|
-
import typing as t
|
|
3
|
-
from dataclasses import dataclass
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
7
|
-
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
8
|
-
from unstructured_ingest.interfaces import (
|
|
9
|
-
AccessConfig,
|
|
10
|
-
BaseConnectorConfig,
|
|
11
|
-
BaseSingleIngestDoc,
|
|
12
|
-
BaseSourceConnector,
|
|
13
|
-
IngestDocCleanupMixin,
|
|
14
|
-
SourceConnectorCleanupMixin,
|
|
15
|
-
SourceMetadata,
|
|
16
|
-
)
|
|
17
|
-
from unstructured_ingest.logger import logger
|
|
18
|
-
from unstructured_ingest.utils.dep_check import (
|
|
19
|
-
requires_dependencies,
|
|
20
|
-
)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@dataclass
class DiscordAccessConfig(AccessConfig):
    """Credentials for the Discord connector."""

    # Token handed to the Discord client when it logs in; declared with
    # sensitive=True on the enhanced field.
    token: str = enhanced_field(sensitive=True)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
@dataclass
class SimpleDiscordConfig(BaseConnectorConfig):
    """Connector config for Discord.

    ``channels`` holds the Discord channels to pull messages from, one entry
    per channel.
    """

    # Discord-specific options
    access_config: DiscordAccessConfig
    # Channel ids as strings; each one is converted with int() when fetched.
    channels: t.List[str]
    # Forwarded to every ingest doc as its ``days`` window; None means no limit.
    period: t.Optional[int] = None
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@dataclass
class DiscordIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Fetch the messages of one Discord channel and expose where results go.

    This class downloads the raw channel text and reports the output path; it
    does not do the processing itself. When things go wrong and the inherited
    cleanup is not called, the downloaded file is left behind on the
    filesystem to assist debugging.
    """

    connector_config: SimpleDiscordConfig
    channel: str
    days: t.Optional[int] = None
    registry_name: str = "discord"

    # NOTE(crag): probably doesn't matter, but intentionally not defining tmp_download_file
    # __post_init__ for multiprocessing simplicity (no Path objects in initially
    # instantiated object)
    def _tmp_download_file(self):
        """Return the path of the text file the channel is downloaded to."""
        channel_file = self.channel + ".txt"
        return Path(self.read_config.download_dir) / channel_file

    @property
    def _output_filename(self):
        """Path of the JSON file that holds the processed output."""
        output_file = self.channel + ".json"
        return Path(self.processor_config.output_dir) / output_file

    def _create_full_tmp_dir_path(self):
        """Create the download directory (and parents) if it is missing."""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(dependencies=["discord"], extras="discord")
    def _get_messages(self):
        """Actually fetches the data from discord.

        Returns:
            A ``(messages, jump_url)`` tuple; ``jump_url`` is None when the
            channel's URL was never recorded.
        """
        import discord
        from discord.ext import commands

        messages: t.List[discord.Message] = []
        jumpurl: t.List[str] = []
        intents = discord.Intents.default()
        intents.message_content = True
        bot = commands.Bot(command_prefix=">", intents=intents)

        @bot.event
        async def on_ready():
            try:
                after_date = None
                if self.days:
                    # Use an aware UTC timestamp: a naive datetime would be
                    # interpreted as local time by the Discord client.
                    now = dt.datetime.now(tz=dt.timezone.utc)
                    after_date = now - dt.timedelta(days=self.days)
                channel = bot.get_channel(int(self.channel))
                jumpurl.append(channel.jump_url)  # type: ignore
                async for msg in channel.history(after=after_date):  # type: ignore
                    messages.append(msg)
            except Exception:
                logger.error("Error fetching messages", exc_info=True)
                raise
            finally:
                # Closing the bot is what lets the blocking bot.run() return.
                await bot.close()

        bot.run(self.connector_config.access_config.token)
        jump_url = jumpurl[0] if jumpurl else None
        return messages, jump_url

    def update_source_metadata(self, **kwargs):
        """Populate ``source_metadata`` from the channel's messages.

        Args:
            **kwargs: may carry ``messages_tuple``, an already fetched
                ``(messages, jump_url)`` pair. Messages are only fetched from
                Discord when that key is absent.
        """
        if "messages_tuple" in kwargs:
            messages, jump_url = kwargs["messages_tuple"]
        else:
            messages, jump_url = self._get_messages()
        if not messages:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        dates = sorted(m.created_at for m in messages if m.created_at)
        # Only pass the date fields when at least one message has a timestamp.
        date_fields = {}
        if dates:
            date_fields = {
                "date_created": dates[0].isoformat(),
                "date_modified": dates[-1].isoformat(),
            }
        self.source_metadata = SourceMetadata(
            source_url=jump_url,
            exists=True,
            **date_fields,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the channel's messages into the temporary text file.

        Raises:
            ValueError: if no messages were retrieved for the channel.
        """
        self._create_full_tmp_dir_path()

        messages, jump_url = self._get_messages()
        self.update_source_metadata(messages_tuple=(messages, jump_url))
        if not messages:
            raise ValueError(f"Failed to retrieve messages from Discord channel {self.channel}")
        # Explicit encoding so message text does not depend on the platform default.
        with open(self._tmp_download_file(), "w", encoding="utf8") as f:
            for m in messages:
                f.write(m.content + "\n")

    @property
    def filename(self):
        """The filename of the file created from a discord channel"""
        return self._tmp_download_file()

    @property
    def version(self) -> t.Optional[str]:
        return None

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        return {
            "channel": self.channel,
        }
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
class DiscordSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Objects of this class support fetching document(s) from Discord channels."""

    connector_config: SimpleDiscordConfig

    def initialize(self):
        pass

    @requires_dependencies(dependencies=["discord"], extras="discord")
    def check_connection(self):
        """Validate the configured token by logging in to Discord.

        Only the login step is performed. Opening the gateway connection as
        well would block until the websocket is closed, so a valid token would
        keep this check from ever returning.

        Raises:
            SourceConnectionError: if the login attempt fails.
        """
        import asyncio

        import discord
        from discord.client import Client

        token = self.connector_config.access_config.token

        async def _validate_token():
            client = Client(intents=discord.Intents.default())
            try:
                await client.login(token)
            finally:
                # Always release the HTTP session opened by the login attempt.
                await client.close()

        try:
            asyncio.run(_validate_token())
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_ingest_docs(self):
        """Return one ingest doc per configured channel."""
        return [
            DiscordIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                channel=channel,
                days=self.connector_config.period,
            )
            for channel in self.connector_config.channels
        ]
|
|
@@ -1,396 +0,0 @@
|
|
|
1
|
-
import copy
|
|
2
|
-
import hashlib
|
|
3
|
-
import typing as t
|
|
4
|
-
import uuid
|
|
5
|
-
from dataclasses import dataclass, field
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
|
|
8
|
-
from dataclasses_json.core import Json
|
|
9
|
-
|
|
10
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
11
|
-
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
12
|
-
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
13
|
-
from unstructured_ingest.interfaces import (
|
|
14
|
-
AccessConfig,
|
|
15
|
-
BaseConnectorConfig,
|
|
16
|
-
BaseDestinationConnector,
|
|
17
|
-
BaseIngestDocBatch,
|
|
18
|
-
BaseSingleIngestDoc,
|
|
19
|
-
BaseSourceConnector,
|
|
20
|
-
IngestDocCleanupMixin,
|
|
21
|
-
SourceConnectorCleanupMixin,
|
|
22
|
-
SourceMetadata,
|
|
23
|
-
WriteConfig,
|
|
24
|
-
)
|
|
25
|
-
from unstructured_ingest.logger import logger
|
|
26
|
-
from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
|
|
27
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
28
|
-
|
|
29
|
-
if t.TYPE_CHECKING:
|
|
30
|
-
from elasticsearch import Elasticsearch
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
@dataclass
class ElasticsearchAccessConfig(AccessConfig):
    """Connection and auth settings for Elasticsearch.

    ``to_dict`` reshapes these fields into the keyword arguments that are
    passed to the Elasticsearch client constructor elsewhere in this module.
    """

    hosts: t.Optional[t.List[str]] = None
    username: t.Optional[str] = None
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    cloud_id: t.Optional[str] = None
    api_key: t.Optional[str] = enhanced_field(
        default=None, sensitive=True, overload_name="es_api_key"
    )
    api_key_id: t.Optional[str] = None
    bearer_auth: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    ca_certs: t.Optional[str] = None
    ssl_assert_fingerprint: t.Optional[str] = enhanced_field(default=None, sensitive=True)

    def to_dict(self, **kwargs) -> t.Dict[str, Json]:
        """Serialize the config, rewriting auth fields to what the SDK expects.

        Supported methods are described at
        https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
        """
        client_kwargs = super().to_dict(**kwargs)

        if not self.ca_certs:
            # ES library already sets a default for this, don't want to
            # introduce data by setting it to None
            client_kwargs.pop("ca_certs")

        secured_endpoint = self.cloud_id or self.ca_certs or self.ssl_assert_fingerprint
        if self.password and secured_endpoint:
            # Password with cloud id / certs / fingerprint: fixed "elastic" user.
            client_kwargs.pop("password")
            client_kwargs["basic_auth"] = ("elastic", self.password)
        elif not self.cloud_id and self.username and self.password:
            for key in ("username", "password"):
                client_kwargs.pop(key, None)
            client_kwargs["basic_auth"] = (self.username, self.password)
        elif self.api_key and self.api_key_id:
            # The SDK takes the id and the key together as one tuple.
            client_kwargs.pop("api_key", None)
            client_kwargs["api_key"] = (self.api_key_id, self.api_key)

        # This doesn't exist on the client init, remove:
        client_kwargs.pop("api_key_id", None)
        return client_kwargs
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
@dataclass
class SimpleElasticsearchConfig(BaseConnectorConfig):
    """Connector config where:
    index_name is the name of the index to reach to,
    batch_size is how many document ids go into one ingest batch,
    fields optionally restricts which document fields are fetched,
    access_config carries the connection and auth settings.
    """

    index_name: str
    batch_size: int = 100
    fields: t.List[str] = field(default_factory=list)
    # Defaults to None even though the annotation is not Optional.
    # NOTE(review): presumably only so the field can follow defaulted ones - confirm.
    access_config: ElasticsearchAccessConfig = None
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
@dataclass
class ElasticsearchDocumentMeta:
    """Identifies a single Elasticsearch document.

    Holds the name of the index being reached to and the id of the document
    within that index.
    """

    index_name: str
    document_id: str
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
@dataclass
class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating a fetched doc and where its processed results go
    (but not doing the processing!).

    ``get_file`` is a no-op here: in this module the document bodies are
    downloaded per batch by ``ElasticsearchIngestDocBatch.get_files``, which
    writes each one to this doc's ``filename``.
    """

    connector_config: SimpleElasticsearchConfig
    document_meta: ElasticsearchDocumentMeta
    document: dict = field(default_factory=dict)
    registry_name: str = "elasticsearch"

    def _file_stem(self) -> str:
        """Return the document id, suffixed with a short hash of the selected fields.

        The suffix is the first 8 hex characters of the SHA256 of the
        comma-joined ``fields`` list and is only added when fields are set.
        """
        stem = self.document_meta.document_id
        if self.connector_config.fields:
            digest = hashlib.sha256(",".join(self.connector_config.fields).encode()).hexdigest()
            stem = "{}-{}".format(stem, digest[:8])
        return stem

    # TODO: remove one of filename or _tmp_download_file, using a wrapper
    @property
    def filename(self):
        """Absolute path of the downloaded text file for this document."""
        return (
            Path(self.read_config.download_dir)
            / self.document_meta.index_name
            / f"{self._file_stem()}.txt"
        ).resolve()

    @property
    def _output_filename(self):
        """Create filename document id combined with a hash of the query to uniquely identify
        the output file."""
        output_file = f"{self._file_stem()}.json"
        return (
            Path(self.processor_config.output_dir) / self.connector_config.index_name / output_file
        )

    def update_source_metadata(self, **kwargs):
        """Record whether a document was fetched and, if so, its version.

        ``document`` defaults to an empty dict, so emptiness (not just None)
        is treated as "document does not exist".
        """
        if not self.document:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        self.source_metadata = SourceMetadata(
            version=self.document.get("_version"),
            exists=True,
        )

    @SourceConnectionError.wrap
    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        pass

    @property
    def date_created(self) -> t.Optional[str]:
        return None

    @property
    def date_modified(self) -> t.Optional[str]:
        return None

    @property
    def source_url(self) -> t.Optional[str]:
        return None

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        return {
            "hosts": self.connector_config.access_config.hosts,
            "index_name": self.connector_config.index_name,
            "document_id": self.document_meta.document_id,
        }
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
@dataclass
class ElasticsearchIngestDocBatch(BaseIngestDocBatch):
    """A batch of Elasticsearch documents, identified by their ids, fetched together."""

    connector_config: SimpleElasticsearchConfig
    ingest_docs: t.List[ElasticsearchIngestDoc] = field(default_factory=list)
    list_of_ids: t.List[str] = field(default_factory=list)
    registry_name: str = "elasticsearch_batch"

    def __post_init__(self):
        # Until python3.8 is deprecated, this is a limitation of dataclass inheritance
        # to make it a required field
        if len(self.list_of_ids) > 0:
            return
        raise ValueError("list_of_ids is required")

    @property
    def unique_id(self) -> str:
        """Comma-joined, sorted ids of the documents in this batch."""
        ordered_ids = sorted(self.list_of_ids)
        return ",".join(ordered_ids)

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def _get_docs(self):
        """Fetch the batch's documents (selected fields plus version) from the index."""
        from elasticsearch import Elasticsearch
        from elasticsearch.helpers import scan

        client_kwargs = self.connector_config.access_config.to_dict(apply_name_overload=False)
        client = Elasticsearch(**client_kwargs)
        ids_query = {
            "_source": self.connector_config.fields,
            "version": True,
            "query": {"ids": {"values": self.list_of_ids}},
        }
        hits = scan(
            client,
            query=ids_query,
            scroll="1m",
            index=self.connector_config.index_name,
        )
        return list(hits)

    @SourceConnectionError.wrap
    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def get_files(self):
        """Download every document in the batch and register an ingest doc for each.

        Each document's flattened ``_source`` values are written, one per
        line, to the ingest doc's ``filename``.
        """
        index_name = self.connector_config.index_name
        for hit in self._get_docs():
            ingest_doc = ElasticsearchIngestDoc(
                processor_config=self.processor_config,
                read_config=self.read_config,
                connector_config=self.connector_config,
                document=hit,
                document_meta=ElasticsearchDocumentMeta(index_name, hit["_id"]),
            )
            ingest_doc.update_source_metadata()
            source_body = hit["_source"]
            target_path = ingest_doc.filename
            flat_values = flatten_dict(dictionary=source_body).values()
            text = "\n".join(str(value) for value in flat_values)

            target_path.parent.mkdir(parents=True, exist_ok=True)
            with open(target_path, "w", encoding="utf8") as f:
                f.write(text)
            self.ingest_docs.append(ingest_doc)
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
@dataclass
class ElasticsearchSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Fetches particular fields from all documents in a given elasticsearch cluster and index"""

    connector_config: SimpleElasticsearchConfig
    _es: t.Optional["Elasticsearch"] = field(init=False, default=None)

    @property
    def es(self):
        """Elasticsearch client, created on first access and then reused."""
        from elasticsearch import Elasticsearch

        if self._es is None:
            self._es = Elasticsearch(
                **self.connector_config.access_config.to_dict(apply_name_overload=False)
            )
        return self._es

    def check_connection(self):
        """Issue a HEAD request against the cluster root to validate the connection.

        Raises:
            SourceConnectionError: if the request fails for any reason.
        """
        try:
            self.es.perform_request("HEAD", "/", headers={"accept": "application/json"})
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def __post_init__(self):
        # Match every document but request no stored fields: only ids are needed.
        self.scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}

    def initialize(self):
        pass

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def _get_doc_ids(self):
        """Fetches all document ids in an index"""
        from elasticsearch.helpers import scan

        hits = scan(
            self.es,
            query=self.scan_query,
            scroll="1m",
            index=self.connector_config.index_name,
        )

        return [hit["_id"] for hit in hits]

    def get_ingest_docs(self):
        """Fetches all documents in an index, using ids that are fetched with _get_doc_ids

        The ids are split into consecutive chunks of ``batch_size`` and one
        ``ElasticsearchIngestDocBatch`` is returned per chunk.

        Raises:
            ValueError: if the configured ``batch_size`` is not positive.
        """
        batch_size = self.connector_config.batch_size
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        ids = self._get_doc_ids()
        return [
            ElasticsearchIngestDocBatch(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                list_of_ids=ids[start : start + batch_size],  # noqa
            )
            for start in range(0, len(ids), batch_size)
        ]
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
@dataclass
class ElasticsearchWriteConfig(WriteConfig):
    """Write settings for the Elasticsearch destination."""

    # Byte limit handed to the batching helper for each upload batch.
    batch_size_bytes: int = 15_000_000
    # Passed to the bulk helper as its thread_count.
    num_processes: int = 1
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
@dataclass
class ElasticsearchDestinationConnector(BaseDestinationConnector):
    """Destination connector that bulk-writes element dicts into an Elasticsearch index."""

    write_config: ElasticsearchWriteConfig
    connector_config: SimpleElasticsearchConfig
    _client: t.Optional["Elasticsearch"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """
        The _client variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_client"):
            setattr(self_cp, "_client", None)
        return _asdict(self_cp, **kwargs)

    @DestinationConnectionError.wrap
    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def generate_client(self) -> "Elasticsearch":
        """Build a new Elasticsearch client from the access config."""
        from elasticsearch import Elasticsearch

        return Elasticsearch(
            **self.connector_config.access_config.to_dict(apply_name_overload=False)
        )

    @property
    def client(self):
        """Elasticsearch client, created on first access and then reused."""
        if self._client is None:
            self._client = self.generate_client()
        return self._client

    def initialize(self):
        # Touch the property so the client is created up front.
        _ = self.client

    @DestinationConnectionError.wrap
    def check_connection(self):
        """Ping the cluster to validate the connection.

        Raises:
            DestinationConnectionError: if the ping fails or raises.
        """
        try:
            # Explicit check instead of assert, which is stripped under -O.
            if not self.client.ping():
                raise ConnectionError("ping to the elasticsearch cluster was unsuccessful")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upload ``elements_dict`` in byte-limited batches, logging any failed item."""
        logger.info(
            f"writing document batches to destination"
            f" index named {self.connector_config.index_name}"
            f" at {self.connector_config.access_config.hosts}"
            f" with batch size (in bytes) {self.write_config.batch_size_bytes}"
            f" with {self.write_config.num_processes} (number of) processes"
        )
        from elasticsearch.helpers import parallel_bulk

        for batch in generator_batching_wbytes(
            elements_dict, batch_size_limit_bytes=self.write_config.batch_size_bytes
        ):
            for success, info in parallel_bulk(
                self.client, batch, thread_count=self.write_config.num_processes
            ):
                if not success:
                    # %s placeholder so the failure details are actually formatted
                    # into the message instead of triggering a logging error.
                    logger.error(
                        "upload failed for a batch in elasticsearch destination connector: %s",
                        info,
                    )

    def normalize_dict(self, element_dict: dict) -> dict:
        """Convert one element dict into a bulk action for the configured index.

        The consumed keys are popped from ``element_dict``, so the argument is
        mutated. Each action gets a freshly generated UUID as its ``_id``.
        """
        return {
            "_index": self.connector_config.index_name,
            "_id": str(uuid.uuid4()),
            "_source": {
                "element_id": element_dict.pop("element_id", None),
                "embeddings": element_dict.pop("embeddings", None),
                "text": element_dict.pop("text", None),
                "type": element_dict.pop("type", None),
                "metadata": flatten_dict(
                    element_dict.pop("metadata", None),
                    separator="-",
                ),
            },
        }
|
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.connector.fsspec.fsspec import (
|
|
5
|
-
FsspecDestinationConnector,
|
|
6
|
-
FsspecIngestDoc,
|
|
7
|
-
FsspecSourceConnector,
|
|
8
|
-
FsspecWriteConfig,
|
|
9
|
-
SimpleFsspecConfig,
|
|
10
|
-
WriteTextConfig,
|
|
11
|
-
)
|
|
12
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
13
|
-
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
14
|
-
from unstructured_ingest.interfaces import AccessConfig
|
|
15
|
-
from unstructured_ingest.logger import logger
|
|
16
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@dataclass
class AzureWriteTextConfig(WriteTextConfig):
    """Azure-specific text-writing options."""

    # NOTE(review): presumably controls whether an existing blob is replaced;
    # confirm against the code that consumes this config.
    overwrite: bool = False
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
@dataclass
class AzureWriteConfig(FsspecWriteConfig):
    """Fsspec write config narrowed to the Azure text-writing options."""

    # Optional Azure text-writing options; None when not provided.
    write_text_config: t.Optional[AzureWriteTextConfig] = None
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@dataclass
class AzureAccessConfig(AccessConfig):
    """Azure credential options.

    Every field is optional and declared with ``sensitive=True``.
    """

    account_name: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    account_key: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    connection_string: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    sas_token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
@dataclass
class SimpleAzureBlobStorageConfig(SimpleFsspecConfig):
    """Fsspec connector config that carries Azure credentials."""

    # Defaults to None even though the annotation is not Optional.
    access_config: AzureAccessConfig = None
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
@dataclass
class AzureBlobStorageIngestDoc(FsspecIngestDoc):
    """Fsspec ingest doc registered under the name "azure"."""

    connector_config: SimpleAzureBlobStorageConfig
    registry_name: str = "azure"

    @SourceConnectionError.wrap
    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    def get_file(self):
        """Delegate to the fsspec download, after the Azure dependency check."""
        super().get_file()
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
@dataclass
class AzureBlobStorageSourceConnector(FsspecSourceConnector):
    """Fsspec source connector that uses the Azure ingest doc class."""

    connector_config: SimpleAzureBlobStorageConfig

    def __post_init__(self):
        # Record which ingest doc class this connector works with.
        self.ingest_doc_cls: t.Type[AzureBlobStorageIngestDoc] = AzureBlobStorageIngestDoc
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
@dataclass
class AzureBlobStorageDestinationConnector(FsspecDestinationConnector):
    """Fsspec destination connector configured for Azure Blob Storage."""

    connector_config: SimpleAzureBlobStorageConfig
    write_config: AzureWriteConfig

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    def initialize(self):
        """Delegate to the fsspec initialization, after the Azure dependency check."""
        super().initialize()

    @requires_dependencies(["adlfs"], extras="azure")
    def check_connection(self):
        """Validate the access config by constructing an Azure filesystem object.

        Raises:
            DestinationConnectionError: if construction raises ValueError.
        """
        from adlfs import AzureBlobFileSystem

        access_kwargs_source = self.connector_config
        try:
            AzureBlobFileSystem(**access_kwargs_source.get_access_config())
        except ValueError as err:
            failure = f"failed to validate connection: {err}"
            logger.error(failure, exc_info=True)
            raise DestinationConnectionError(failure)
|