unstructured-ingest 0.3.13__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
import xml.etree.ElementTree as ET
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
8
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
9
|
+
from unstructured_ingest.interfaces import (
|
|
10
|
+
AccessConfig,
|
|
11
|
+
BaseConnectorConfig,
|
|
12
|
+
BaseSingleIngestDoc,
|
|
13
|
+
BaseSourceConnector,
|
|
14
|
+
IngestDocCleanupMixin,
|
|
15
|
+
SourceConnectorCleanupMixin,
|
|
16
|
+
SourceMetadata,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.logger import logger
|
|
19
|
+
from unstructured_ingest.utils.data_prep import validate_date_args
|
|
20
|
+
from unstructured_ingest.utils.dep_check import (
|
|
21
|
+
requires_dependencies,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
# strptime layouts tried, in order, by SlackIngestDoc.convert_datetime when parsing
# the configured start/end dates.
DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class SlackAccessConfig(AccessConfig):
    """Credentials for the Slack connector."""

    # Token handed to slack_sdk's WebClient; declared as a sensitive field.
    token: str = enhanced_field(sensitive=True)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class SimpleSlackConfig(BaseConnectorConfig):
    """Connector config to process all messages by channel id's.

    ``start_date`` / ``end_date`` are optional date strings bounding the
    messages that are fetched; they are validated when the config is created.
    """

    access_config: SlackAccessConfig
    channels: t.List[str]
    start_date: t.Optional[str] = None
    end_date: t.Optional[str] = None

    def validate_inputs(self):
        """Validate the optional date bounds.

        Returns:
            A ``(oldest_valid, latest_valid)`` tuple. A bound that is not set
            is reported as valid; a set bound gets whatever
            ``validate_date_args`` returns for it.
        """
        oldest_valid = True
        latest_valid = True

        if self.start_date:
            oldest_valid = validate_date_args(self.start_date)

        if self.end_date:
            latest_valid = validate_date_args(self.end_date)

        return oldest_valid, latest_valid

    def __post_init__(self):
        """Reject the config as soon as either date bound fails validation.

        Raises:
            ValueError: if the start date, the end date, or both are invalid.
        """
        oldest_valid, latest_valid = self.validate_inputs()
        # Either bound being invalid is an error ("and/or" in the message);
        # the previous `and` only fired when both were invalid.
        if not (oldest_valid and latest_valid):
            raise ValueError(
                "Start and/or End dates are not valid. ",
            )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class SlackIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating fetching a doc and writing processed results (but not
    doing the processing!).

    One instance covers one Slack channel: the channel's messages (and their
    thread replies) are written to ``<channel>.xml`` in the download directory.

    Also includes a cleanup method. When things go wrong and the cleanup
    method is not called, the file is left behind on the filesystem to assist debugging.
    """

    connector_config: SimpleSlackConfig
    channel: str
    registry_name: str = "slack"

    # NOTE(crag): probably doesn't matter, but intentionally not defining tmp_download_file
    # __post_init__ for multiprocessing simplicity (no Path objects in initially
    # instantiated object)
    def _tmp_download_file(self):
        """Return the local path the channel's XML dump is written to."""
        channel_file = self.channel + ".xml"
        return Path(self.read_config.download_dir) / channel_file

    @property
    def _output_filename(self):
        """Path of the JSON output file for this channel."""
        output_file = self.channel + ".json"
        return Path(self.processor_config.output_dir) / output_file

    @property
    def version(self) -> t.Optional[str]:
        """No version is reported for a Slack channel."""
        return None

    @property
    def source_url(self) -> t.Optional[str]:
        """No source URL is reported for a Slack channel."""
        return None

    def _create_full_tmp_dir_path(self):
        """Make sure the directory holding the download file exists."""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(dependencies=["slack_sdk"], extras="slack")
    def _fetch_messages(self):
        """Call ``conversations_history`` for the channel and return the response.

        Also (re)creates ``self.client``, which ``get_file`` reuses to fetch
        thread replies.
        """
        from slack_sdk import WebClient

        self.client = WebClient(token=self.connector_config.access_config.token)
        # "0" is what gets sent when a bound is not configured.
        oldest = "0"
        latest = "0"
        if self.connector_config.start_date:
            oldest = self.convert_datetime(self.connector_config.start_date)

        if self.connector_config.end_date:
            latest = self.convert_datetime(self.connector_config.end_date)

        result = self.client.conversations_history(
            channel=self.channel,
            oldest=oldest,
            latest=latest,
        )
        return result

    def update_source_metadata(self, **kwargs):
        """Populate ``self.source_metadata`` from the channel's message timestamps.

        Args:
            **kwargs: may carry ``result``, an already-fetched
                ``conversations_history`` response. Messages are fetched only
                when ``result`` is not supplied.
        """
        # Look the key up explicitly: `kwargs.get("result", self._fetch_messages())`
        # evaluated the default eagerly and hit the Slack API on every call.
        if "result" in kwargs:
            result = kwargs["result"]
        else:
            result = self._fetch_messages()

        if result is None:
            self.source_metadata = SourceMetadata(
                exists=True,
            )
            return

        # Order numerically rather than as strings so the bounds stay correct
        # even if the timestamp strings differ in length.
        timestamps = sorted((m["ts"] for m in result["messages"]), key=float)
        date_created = None
        date_modified = None
        if timestamps:
            date_created = datetime.fromtimestamp(float(timestamps[0])).isoformat()
            date_modified = datetime.fromtimestamp(float(timestamps[-1])).isoformat()

        self.source_metadata = SourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            exists=True,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    @requires_dependencies(dependencies=["slack_sdk"], extras="slack")
    def get_file(self):
        """Fetches the data from a slack channel and stores it locally."""
        from slack_sdk.errors import SlackApiError

        self._create_full_tmp_dir_path()

        result = self._fetch_messages()
        self.update_source_metadata(result=result)
        root = ET.Element("messages")
        for message in result["messages"]:
            message_elem = ET.SubElement(root, "message")
            text_elem = ET.SubElement(message_elem, "text")
            text_elem.text = message.get("text")

            # Page through the thread replies and append them to the message text.
            cursor = None
            while True:
                try:
                    response = self.client.conversations_replies(
                        channel=self.channel,
                        ts=message["ts"],
                        cursor=cursor,
                    )
                except SlackApiError as e:
                    logger.error(f"Error retrieving replies: {e.response['error']}")
                    # Give up on this thread; without the break the same failing
                    # request would be retried forever.
                    break

                for reply in response["messages"]:
                    # A reply may carry no text; join() needs a string.
                    reply_msg = reply.get("text") or ""
                    text_elem.text = "".join([str(text_elem.text), " <reply> ", reply_msg])

                if not response["has_more"]:
                    break

                cursor = response["response_metadata"]["next_cursor"]

        tree = ET.ElementTree(root)
        tree.write(self._tmp_download_file(), encoding="utf-8", xml_declaration=True)

    def convert_datetime(self, date_time):
        """Parse ``date_time`` with the first matching ``DATE_FORMATS`` layout.

        Returns:
            The POSIX timestamp as a float, or ``None`` when no layout matches.
        """
        for date_format in DATE_FORMATS:
            try:
                return datetime.strptime(date_time, date_format).timestamp()
            except ValueError:
                # Not this layout; try the next one.
                continue
        return None

    @property
    def filename(self):
        """The filename of the file created from a slack channel"""
        return self._tmp_download_file()
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
class SlackSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that produces one SlackIngestDoc per configured channel."""

    connector_config: SimpleSlackConfig

    @requires_dependencies(dependencies=["slack_sdk"], extras="slack")
    def check_connection(self):
        """Validate the configured token by calling ``users_identity``.

        Raises:
            SourceConnectionError: when slack_sdk reports a client error.
        """
        from slack_sdk import WebClient
        from slack_sdk.errors import SlackClientError

        try:
            slack_client = WebClient(token=self.connector_config.access_config.token)
            slack_client.users_identity()
        except SlackClientError as slack_error:
            logger.error(f"failed to validate connection: {slack_error}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {slack_error}")

    def initialize(self):
        """Verify that can get metadata for an object, validates connections info."""

    def get_ingest_docs(self):
        """Build the ingest docs, one for each channel in the connector config."""
        ingest_docs = []
        for channel in self.connector_config.channels:
            ingest_docs.append(
                SlackIngestDoc(
                    connector_config=self.connector_config,
                    processor_config=self.processor_config,
                    read_config=self.read_config,
                    channel=channel,
                )
            )
        return ingest_docs
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import json
|
|
3
|
+
import typing as t
|
|
4
|
+
import uuid
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
8
|
+
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
9
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
10
|
+
from unstructured_ingest.interfaces import (
|
|
11
|
+
AccessConfig,
|
|
12
|
+
BaseConnectorConfig,
|
|
13
|
+
BaseDestinationConnector,
|
|
14
|
+
)
|
|
15
|
+
from unstructured_ingest.logger import logger
|
|
16
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
|
+
|
|
18
|
+
# Name of the table that SqlDestinationConnector.write_dict inserts element rows into.
ELEMENTS_TABLE_NAME = "elements"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class SqlAccessConfig(AccessConfig):
    """Database login used by the SQL connector."""

    # Passed as `user` when opening a psycopg2 connection.
    username: t.Optional[str]
    # Passed as `password` when opening a psycopg2 connection; declared sensitive.
    password: t.Optional[str] = enhanced_field(sensitive=True)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class SimpleSqlConfig(BaseConnectorConfig):
    """Connection settings for the SQL destination (``postgresql`` or ``sqlite``)."""

    db_type: t.Optional[str]
    host: t.Optional[str]
    database: t.Optional[str]
    port: t.Optional[int]
    access_config: SqlAccessConfig

    def __post_init__(self):
        """Require ``database`` (the *.db path) whenever ``db_type`` is sqlite."""
        sqlite_without_path = (self.db_type == "sqlite") and (self.database is None)
        if sqlite_without_path:
            raise ValueError(
                "A sqlite connection requires a path to a *.db file "
                "through the `database` argument"
            )

    @property
    def connection(self):
        """Return the connection factory (a bound method) matching ``db_type``.

        Raises:
            ValueError: for any ``db_type`` other than postgresql or sqlite.
        """
        if self.db_type == "sqlite":
            return self._make_sqlite_connection
        if self.db_type == "postgresql":
            return self._make_psycopg_connection
        raise ValueError(f"Unsupported database {self.db_type} connection.")

    def _make_sqlite_connection(self):
        """Open a sqlite3 connection to the ``database`` path."""
        from sqlite3 import connect

        sqlite_conn = connect(database=self.database)
        return sqlite_conn

    @requires_dependencies(["psycopg2"], extras="postgres")
    def _make_psycopg_connection(self):
        """Open a psycopg2 connection from the configured host and credentials."""
        from psycopg2 import connect

        connect_kwargs = {
            "user": self.access_config.username,
            "password": self.access_config.password,
            "dbname": self.database,
            "host": self.host,
            "port": self.port,
        }
        return connect(**connect_kwargs)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class SqlDestinationConnector(BaseDestinationConnector):
    """Destination connector that inserts element dicts into the ``elements`` table."""

    connector_config: SimpleSqlConfig
    # Lazily created DB connection; see the `client` property.
    _client: t.Optional[t.Any] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """
        The _client variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_client"):
            self_cp._client = None
        return _asdict(self_cp, **kwargs)

    @property
    def client(self):
        """Return the cached connection, opening one on first use."""
        if self._client is None:
            self._client = self.connector_config.connection()
        return self._client

    @DestinationConnectionError.wrap
    def initialize(self):
        """Open the database connection eagerly."""
        _ = self.client

    def check_connection(self):
        """Run ``SELECT 1`` to prove the connection works.

        Raises:
            DestinationConnectionError: if connecting or querying fails.
        """
        try:
            cursor = self.client.cursor()
            cursor.execute("SELECT 1;")
            cursor.close()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def conform_dict(self, data: dict) -> None:
        """
        Updates the element dictionary to conform to the sql schema

        Works in place: assigns a fresh uuid4 ``id``, serialises nested values
        to JSON strings, parses ISO date strings into ``datetime`` objects,
        casts ``version`` / ``page_number`` to ``str``, and finally flattens
        ``metadata`` (including its ``data_source`` and ``coordinates``
        sub-dicts) into the top level of ``data``.
        """
        from datetime import datetime

        data["id"] = str(uuid.uuid4())

        # These are the same dict objects that live inside `data`, so the
        # assignments below update `data` in place.
        metadata = data.get("metadata") or {}
        data_source = metadata.get("data_source") or {}
        coordinates = metadata.get("coordinates") or {}

        # Dict as string formatting
        if record_locator := data_source.get("record_locator"):
            # Explicit casting otherwise fails schema type checking
            data_source["record_locator"] = str(json.dumps(record_locator))

        # Array of items as string formatting
        if (embeddings := data.get("embeddings")) and (
            self.connector_config.db_type != "postgresql"
        ):
            data["embeddings"] = str(json.dumps(embeddings))

        if points := coordinates.get("points"):
            coordinates["points"] = str(json.dumps(points))

        if links := metadata.get("links"):
            metadata["links"] = str(json.dumps(links))

        if permissions_data := data_source.get("permissions_data"):
            data_source["permissions_data"] = json.dumps(permissions_data)

        if sent_from := metadata.get("sent_from"):
            metadata["sent_from"] = str(json.dumps(sent_from))

        if sent_to := metadata.get("sent_to"):
            metadata["sent_to"] = str(json.dumps(sent_to))

        # Datetime formatting
        for date_key in ("date_created", "date_modified", "date_processed"):
            if date_value := data_source.get(date_key):
                data_source[date_key] = datetime.fromisoformat(date_value)

        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = datetime.fromisoformat(last_modified)

        # String casting
        if version := data_source.get("version"):
            data_source["version"] = str(version)

        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = str(json.dumps(regex_metadata))

        # Flatten: hoist the nested dicts' keys to the top level of `data`.
        if data_source:
            data.update(metadata.pop("data_source"))
        if coordinates:
            data.update(metadata.pop("coordinates"))
        if metadata:
            data.update(data.pop("metadata"))

    @DestinationConnectionError.wrap
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Insert every element as one row of the elements table, then close the connection.

        Args:
            elements_dict: element dicts whose keys are used as column names.
        """
        logger.info(
            f"writing {len(elements_dict)} objects to database {self.connector_config.database} "
            f"at {self.connector_config.host}"
        )

        is_sqlite = self.connector_config.db_type == "sqlite"
        placeholder = "?" if is_sqlite else "%s"

        conn = self.client
        try:
            with conn:
                cursor = conn.cursor()
                try:
                    # Since we have no guarantee that each element will have the same keys
                    # we insert each element individually
                    for elem in elements_dict:
                        columns = ",".join(elem.keys())
                        placeholders = ",".join(placeholder for _ in elem)
                        query = (
                            f"INSERT INTO {ELEMENTS_TABLE_NAME} ({columns}) "
                            f"VALUES({placeholders})"
                        )
                        # Lists are stored as JSON text when writing to sqlite.
                        values = [
                            json.dumps(v) if is_sqlite and isinstance(v, list) else v
                            for v in elem.values()
                        ]
                        cursor.execute(query, values)

                    conn.commit()
                finally:
                    cursor.close()
        finally:
            # Leaving contexts doesn't close the connection, so doing it here
            conn.close()
            # Drop the cached handle so the next access to `client` reconnects
            # instead of reusing the closed connection.
            self._client = None
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import json
|
|
3
|
+
import typing as t
|
|
4
|
+
import uuid
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
8
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
9
|
+
from unstructured_ingest.interfaces import (
|
|
10
|
+
AccessConfig,
|
|
11
|
+
BaseConnectorConfig,
|
|
12
|
+
BaseDestinationConnector,
|
|
13
|
+
BaseIngestDoc,
|
|
14
|
+
WriteConfig,
|
|
15
|
+
)
|
|
16
|
+
from unstructured_ingest.logger import logger
|
|
17
|
+
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
18
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
19
|
+
|
|
20
|
+
# Root of the Vectara REST API; VectaraDestinationConnector._request appends the endpoint name.
BASE_URL = "https://api.vectara.io/v1"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class VectaraAccessConfig(AccessConfig):
    """OAuth client credentials sent when requesting a Vectara JWT token."""

    # Sent as `client_id` in the token request; declared sensitive.
    oauth_client_id: str = enhanced_field(sensitive=True)
    # Sent as `client_secret` in the token request; declared sensitive.
    oauth_secret: str = enhanced_field(sensitive=True)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class SimpleVectaraConfig(BaseConnectorConfig):
    """Settings identifying the Vectara customer and the target corpus."""

    access_config: VectaraAccessConfig
    # Sent as the `customer-id` header and substituted into `token_url`.
    customer_id: str
    # Corpus to look up by name (and create when no match is found).
    corpus_name: t.Optional[str] = None
    # Filled in by the connector once the corpus has been found or created.
    corpus_id: t.Optional[str] = None
    # OAuth token endpoint template; `{}` is replaced with `customer_id`.
    token_url: str = "https://vectara-prod-{}.auth.us-west-2.amazoncognito.com/oauth2/token"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class VectaraDestinationConnector(BaseDestinationConnector):
    """Destination connector that indexes partitioned documents into a Vectara corpus."""

    write_config: WriteConfig
    connector_config: SimpleVectaraConfig
    # Cached OAuth token and the POSIX timestamp at which it expires.
    _jwt_token: t.Optional[str] = field(init=False, default=None)
    _jwt_token_expires_ts: t.Optional[float] = field(init=False, default=None)

    @property
    def jwt_token(self):
        """Return a JWT token, fetching a new one when missing or within 60s of expiry."""
        if (
            not self._jwt_token
            or self._jwt_token_expires_ts - datetime.datetime.now().timestamp() <= 60
        ):
            self._jwt_token = self._get_jwt_token()
        return self._jwt_token

    @DestinationConnectionError.wrap
    def vectara(self):
        """
        Check the connection for Vectara and validate corpus exists.
        - If more than one corpus with the same name exists - then return a message
        - If exactly one corpus exists with this name - use it.
        - If does not exist - create it.

        Raises:
            DestinationConnectionError: if any step of the lookup/creation fails.
        """
        try:
            # Get token if not already set
            self.jwt_token

            list_corpora_response = self._request(
                endpoint="list-corpora",
                data={"numResults": 1, "filter": self.connector_config.corpus_name},
            )

            # A response without a "corpus" entry means "no match"; treat it as
            # an empty list so we fall through to corpus creation instead of
            # failing on iterating None.
            corpora = list_corpora_response.get("corpus") or []
            possible_corpora_ids_names_map = {
                corpus.get("id"): corpus.get("name")
                for corpus in corpora
                if corpus.get("name") == self.connector_config.corpus_name
            }

            # NOTE(review): with numResults=1 this branch looks unreachable — confirm.
            if len(possible_corpora_ids_names_map) > 1:
                return f"Multiple Corpora exist with name {self.connector_config.corpus_name}"
            if len(possible_corpora_ids_names_map) == 1:
                self.connector_config.corpus_id = list(possible_corpora_ids_names_map.keys())[0]
            else:
                data = {
                    "corpus": {
                        "name": self.connector_config.corpus_name,
                    }
                }
                create_corpus_response = self._request(endpoint="create-corpus", data=data)
                self.connector_config.corpus_id = create_corpus_response.get("corpusId")

        except Exception as e:
            logger.error(f"failed to create Vectara connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to create Vectara connection: {e}") from e

    def initialize(self):
        """Authenticate and resolve (or create) the target corpus."""
        self.vectara()

    @requires_dependencies(["requests"], extras="vectara")
    def _request(
        self,
        endpoint: str,
        http_method: str = "POST",
        params: t.Optional[t.Mapping[str, t.Any]] = None,
        data: t.Optional[t.Mapping[str, t.Any]] = None,
    ):
        """Send an authenticated JSON request to ``BASE_URL/<endpoint>``.

        Args:
            endpoint: API endpoint name appended to ``BASE_URL``.
            http_method: HTTP verb to use.
            params: optional query-string parameters.
            data: optional payload; serialised with ``json.dumps``.

        Returns:
            The decoded JSON response body.

        Raises:
            requests.HTTPError: on a non-success status (via ``raise_for_status``).
        """
        import requests

        url = f"{BASE_URL}/{endpoint}"

        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {self.jwt_token}",
            "customer-id": self.connector_config.customer_id,
            "X-source": "unstructured",
        }

        response = requests.request(
            method=http_method, url=url, headers=headers, params=params, data=json.dumps(data)
        )
        response.raise_for_status()
        return response.json()

    # Get Oauth2 JWT token
    @requires_dependencies(["requests"], extras="vectara")
    def _get_jwt_token(self):
        """Connect to the server and get a JWT token.

        Also records the token's expiry time in ``_jwt_token_expires_ts``.
        """
        import requests

        token_endpoint = self.connector_config.token_url.format(self.connector_config.customer_id)
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
        }
        data = {
            "grant_type": "client_credentials",
            "client_id": self.connector_config.access_config.oauth_client_id,
            "client_secret": self.connector_config.access_config.oauth_secret,
        }

        response = requests.request(method="POST", url=token_endpoint, headers=headers, data=data)
        response.raise_for_status()
        response_json = response.json()

        request_time = datetime.datetime.now().timestamp()
        self._jwt_token_expires_ts = request_time + response_json.get("expires_in")

        return response_json.get("access_token")

    @DestinationConnectionError.wrap
    def check_connection(self):
        """Validate the connection by running the same checks as ``vectara()``.

        Raises:
            DestinationConnectionError: if the check fails.
        """
        try:
            self.vectara()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def _delete_doc(self, doc_id: str) -> None:
        """
        Delete a document from the Vectara corpus.

        Args:
            doc_id (str): ID of the document to delete.
        """
        body = {
            "customer_id": self.connector_config.customer_id,
            "corpus_id": self.connector_config.corpus_id,
            "document_id": doc_id,
        }
        self._request(endpoint="delete-doc", data=body)

    def _index_document(self, document: t.Dict[str, t.Any]) -> None:
        """
        Index a document (by uploading it to the Vectara corpus) from the document dictionary

        A request failure is logged and swallowed. When Vectara reports that the
        document already exists, it is deleted and indexed again.
        """
        body = {
            "customer_id": self.connector_config.customer_id,
            "corpus_id": self.connector_config.corpus_id,
            "document": document,
        }

        try:
            result = self._request(endpoint="index", data=body, http_method="POST")
        except Exception as e:
            logger.info(f"exception {e} while indexing document {document['documentId']}")
            return

        if (
            "status" in result
            and result["status"]
            and (
                "ALREADY_EXISTS" in result["status"]["code"]
                or (
                    "CONFLICT" in result["status"]["code"]
                    and "Indexing doesn't support updating documents"
                    in result["status"]["statusDetail"]
                )
            )
        ):
            logger.info(f"document {document['documentId']} already exists, re-indexing")
            self._delete_doc(document["documentId"])
            self._request(endpoint="index", data=body, http_method="POST")
            return

        if "status" in result and result["status"] and "OK" in result["status"]["code"]:
            logger.info(f"indexing document {document['documentId']} succeeded")
        else:
            logger.info(f"indexing document {document['documentId']} failed, response = {result}")

    def write_dict(self, *args, docs_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Index every prepared Vectara document in ``docs_list``."""
        logger.info(f"inserting / updating {len(docs_list)} documents to Vectara ")
        for vdoc in docs_list:
            self._index_document(vdoc)

    def write(self, docs: t.List[BaseIngestDoc]) -> None:
        """Convert each doc's JSON output file into a Vectara document and index it.

        Every element of the output file becomes one section (its text plus a
        JSON blob of selected metadata). Output files with no elements are skipped.
        """
        docs_list: t.List[t.Dict[str, t.Any]] = []

        def get_metadata(element) -> t.Dict[str, t.Any]:
            """
            Select which meta-data fields to include and optionally map them to a new name.
            remove the "metadata-" prefix from the keys
            """
            metadata_map = {
                "page_number": "page_number",
                "data_source-url": "url",
                "filename": "filename",
                "filetype": "filetype",
                "last_modified": "last_modified",
            }
            md = flatten_dict(element, separator="-", flatten_lists=True)
            md = {k.replace("metadata-", ""): v for k, v in md.items()}
            md = {metadata_map[k]: v for k, v in md.items() if k in metadata_map}
            return md

        for doc in docs:
            local_path = doc._output_filename
            with open(local_path) as json_file:
                dict_content = json.load(json_file)

            if not dict_content:
                # Nothing to index, and dict_content[0] below would raise IndexError.
                logger.warning(f"No json elements found in {local_path}, skipping")
                continue

            vdoc = {
                "documentId": str(uuid.uuid4()),
                "title": dict_content[0].get("metadata", {}).get("data_source", {}).get("url"),
                "section": [
                    {
                        # "text" is popped first so it is not part of the metadata blob.
                        "text": element.pop("text", None),
                        "metadataJson": json.dumps(get_metadata(element)),
                    }
                    for element in dict_content
                ],
            }
            # Report the number of sections; len(vdoc) is just the key count (always 3).
            logger.info(
                f"Extending {len(vdoc['section'])} json elements from content in {local_path}",
            )
            docs_list.append(vdoc)
        self.write_dict(docs_list=docs_list)
|