unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.fsspec.azure import (
|
|
9
|
+
AzureWriteConfig,
|
|
10
|
+
SimpleAzureBlobStorageConfig,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class AzureWriter(Writer):
    """Writer that resolves the Azure Blob Storage destination connector class."""

    # Connection settings for the destination.
    connector_config: "SimpleAzureBlobStorageConfig"
    # Write-specific settings for the destination.
    write_config: "AzureWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for Azure Blob Storage."""
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.fsspec.azure import (
            AzureBlobStorageDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.fsspec.box import BoxWriteConfig, SimpleBoxConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class BoxWriter(Writer):
    """Writer that resolves the Box destination connector class."""

    # Connection settings for the destination.
    connector_config: "SimpleBoxConfig"
    # Write-specific settings for the destination.
    write_config: "BoxWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for Box."""
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.fsspec.box import (
            BoxDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.fsspec.dropbox import DropboxWriteConfig, SimpleDropboxConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class DropboxWriter(Writer):
    """Writer that resolves the Dropbox destination connector class."""

    # Connection settings for the destination.
    connector_config: "SimpleDropboxConfig"
    # Write-specific settings for the destination.
    write_config: "DropboxWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for Dropbox."""
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.fsspec.dropbox import (
            DropboxDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.fsspec.gcs import GcsWriteConfig, SimpleGcsConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class GcsWriter(Writer):
    """Writer that resolves the GCS destination connector class."""

    # Connection settings for the destination.
    connector_config: "SimpleGcsConfig"
    # Write-specific settings for the destination.
    write_config: "GcsWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for GCS."""
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.fsspec.gcs import (
            GcsDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.fsspec.s3 import S3WriteConfig, SimpleS3Config
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class S3Writer(Writer):
    """Writer that resolves the S3 destination connector class."""

    # Connection settings for the destination.
    connector_config: "SimpleS3Config"
    # Write-specific settings for the destination.
    write_config: "S3WriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for S3."""
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.fsspec.s3 import (
            S3DestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.kafka import KafkaWriteConfig, SimpleKafkaConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class KafkaWriter(Writer):
    """Writer that resolves the Kafka destination connector class."""

    # Write-specific settings for the destination (declared first, as in the original).
    write_config: "KafkaWriteConfig"
    # Connection settings for the destination.
    connector_config: "SimpleKafkaConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for Kafka."""
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.kafka import (
            KafkaDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.mongodb import MongoDBWriteConfig, SimpleMongoDBConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class MongodbWriter(Writer):
    """Writer that resolves the MongoDB destination connector class."""

    # Write-specific settings for the destination (declared first, as in the original).
    write_config: "MongoDBWriteConfig"
    # Connection settings for the destination.
    connector_config: "SimpleMongoDBConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for MongoDB."""
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.mongodb import (
            MongoDBDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.elasticsearch import (
|
|
9
|
+
ElasticsearchWriteConfig,
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.connector.opensearch import (
|
|
12
|
+
SimpleOpenSearchConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class OpenSearchWriter(Writer):
    """Writer that resolves the OpenSearch destination connector class.

    The write configuration type is the Elasticsearch one; only the connection
    configuration is OpenSearch-specific.
    """

    # Connection settings for the destination.
    connector_config: "SimpleOpenSearchConfig"
    # Write-specific settings; typed with the Elasticsearch write config.
    write_config: "ElasticsearchWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for OpenSearch.

        The return annotation is ``t.Type[...]`` because the class object itself
        is returned, matching the other writers' ``get_connector_cls`` methods.
        """
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.opensearch import (
            OpenSearchDestinationConnector,
        )

        return OpenSearchDestinationConnector
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.pinecone import PineconeWriteConfig, SimplePineconeConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class PineconeWriter(Writer):
    """Writer that resolves the Pinecone destination connector class."""

    # Write-specific settings for the destination (declared first, as in the original).
    write_config: "PineconeWriteConfig"
    # Connection settings for the destination.
    connector_config: "SimplePineconeConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for Pinecone."""
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.pinecone import (
            PineconeDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.qdrant import QdrantWriteConfig, SimpleQdrantConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class QdrantWriter(Writer):
    """Writer that resolves the Qdrant destination connector class."""

    # Write-specific settings for the destination (declared first, as in the original).
    write_config: "QdrantWriteConfig"
    # Connection settings for the destination.
    connector_config: "SimpleQdrantConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for Qdrant."""
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.qdrant import (
            QdrantDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.sql import SimpleSqlConfig
|
|
9
|
+
from unstructured_ingest.interfaces import WriteConfig
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class SqlWriter(Writer):
    """Writer that resolves the SQL destination connector class."""

    # Write-specific settings; typed with the generic ``WriteConfig`` interface.
    write_config: "WriteConfig"
    # Connection settings for the destination.
    connector_config: "SimpleSqlConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for SQL."""
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.sql import (
            SqlDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
5
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
6
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
7
|
+
|
|
8
|
+
if t.TYPE_CHECKING:
|
|
9
|
+
from unstructured_ingest.connector.vectara import SimpleVectaraConfig, VectaraWriteConfig
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class VectaraWriter(Writer, EnhancedDataClassJsonMixin):
    """Writer that resolves the Vectara destination connector class.

    Also mixes in ``EnhancedDataClassJsonMixin`` in addition to ``Writer``.
    """

    # Write-specific settings for the destination (declared first, as in the original).
    write_config: "VectaraWriteConfig"
    # Connection settings for the destination.
    connector_config: "SimpleVectaraConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for Vectara."""
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.vectara import (
            VectaraDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.weaviate import SimpleWeaviateConfig, WeaviateWriteConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class WeaviateWriter(Writer):
    """Writer that resolves the Weaviate destination connector class."""

    # Write-specific settings for the destination (declared first, as in the original).
    write_config: "WeaviateWriteConfig"
    # Connection settings for the destination.
    connector_config: "SimpleWeaviateConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (not an instance) for Weaviate."""
        # Deferred import: the connector module is only loaded when this method runs.
        from unstructured_ingest.connector.weaviate import (
            WeaviateDestinationConnector as connector_cls,
        )

        return connector_cls
|
|
File without changes
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import hashlib
|
|
3
|
+
import json
|
|
4
|
+
import zlib
|
|
5
|
+
from itertools import groupby
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def id_to_hash(element: dict, sequence_number: int) -> str:
    """Compute a deterministic hash ID for an element and store it on the element.

    The hashed string is the concatenation of the element's filename, text,
    page number and ``sequence_number`` (in that order). The first 32 hex
    characters of its SHA-256 digest are written to ``element["element_id"]``.

    Args:
        element: element dict with a ``"text"`` entry and a ``"metadata"`` dict;
            it is mutated in place.
        sequence_number: index on page

    Returns: new ID value
    """
    metadata = element["metadata"]
    hash_parts = (
        metadata.get("filename"),
        element["text"],
        metadata.get("page_number"),
        sequence_number,
    )
    # Missing filename / page number are rendered as the string "None".
    hash_input = "".join(f"{part}" for part in hash_parts)
    new_id = hashlib.sha256(hash_input.encode()).hexdigest()[:32]
    element["element_id"] = new_id
    return new_id
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
    """Replace every element's ID with a deterministic hash and remap parent IDs.

    Sequence numbers restart at 0 for each consecutive run of elements sharing
    the same ``metadata["page_number"]``. The returned list is a shallow copy of
    the input list, so the element dicts themselves are modified in place.

    Raises:
        KeyError: if an element has no ``"element_id"`` or its ``parent_id``
            does not match the original ID of any element in ``elements``.
    """
    elements = elements.copy()

    # -- generate sequence number for each element on a page --
    sequence_numbers: list[int] = []
    page_runs = groupby(e["metadata"].get("page_number") for e in elements)
    for _page, run in page_runs:
        sequence_numbers.extend(index for index, _ in enumerate(run))

    # -- assign hash IDs to elements, remembering which old ID became which new one --
    new_id_by_old_id: dict = {}
    for element, seq_on_page in zip(elements, sequence_numbers):
        previous_id = element["element_id"]
        new_id_by_old_id[previous_id] = id_to_hash(element=element, sequence_number=seq_on_page)

    # -- map old parent IDs to new ones --
    for element in elements:
        parent_id = element["metadata"].get("parent_id")
        if parent_id:
            element["metadata"]["parent_id"] = new_id_by_old_id[parent_id]

    return elements
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def elements_from_base64_gzipped_json(raw_s: str) -> list[dict]:
    """Decode a base64 string of zlib-compressed UTF-8 JSON into its parsed value.

    The payload is base64-decoded, inflated with ``zlib.decompress``, decoded as
    UTF-8 and parsed with ``json.loads``.
    """
    compressed_bytes = base64.b64decode(raw_s)
    json_text = zlib.decompress(compressed_bytes).decode("utf-8")
    return json.loads(json_text)
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
import tarfile
|
|
5
|
+
import zipfile
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.connector.local import LocalSourceConnector, SimpleLocalConfig
|
|
11
|
+
from unstructured_ingest.interfaces import (
|
|
12
|
+
BaseConnectorConfig,
|
|
13
|
+
BaseSingleIngestDoc,
|
|
14
|
+
ProcessorConfig,
|
|
15
|
+
ReadConfig,
|
|
16
|
+
)
|
|
17
|
+
from unstructured_ingest.logger import logger
|
|
18
|
+
|
|
19
|
+
ZIP_FILE_EXT = [".zip"]
|
|
20
|
+
TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def uncompress_file(filename: str, path: Optional[str] = None) -> str:
    """
    Takes in a compressed zip or tar file and decompresses it.

    The archive type is chosen from the filename extension. When ``path`` is
    given it is created (including parents) and used as the extraction target;
    otherwise the zip/tar helper picks a directory next to the archive.
    Returns the directory the archive was extracted into.

    Raises ValueError when the extension is neither a known zip nor tar one.
    """
    # Create path if it doesn't already exist
    if path:
        Path(path).mkdir(parents=True, exist_ok=True)

    if filename.endswith(tuple(ZIP_FILE_EXT)):
        return uncompress_zip_file(zip_filename=filename, path=path)
    if filename.endswith(tuple(TAR_FILE_EXT)):
        return uncompress_tar_file(tar_filename=filename, path=path)

    supported_extensions = ", ".join(ZIP_FILE_EXT + TAR_FILE_EXT)
    raise ValueError(
        "filename {} not a recognized compressed extension: {}".format(
            filename,
            supported_extensions,
        ),
    )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def uncompress_zip_file(zip_filename: str, path: Optional[str] = None) -> str:
    """Extract a zip archive and return the directory it was extracted into.

    When ``path`` is falsy the target is ``<archive name without extension>-zip-uncompressed``
    in the same directory as the archive.
    """
    directory, base_name = os.path.split(zip_filename)
    # Strip the first matching zip extension (if any) from the file name.
    matched_ext = next((ext for ext in ZIP_FILE_EXT if base_name.endswith(ext)), None)
    if matched_ext is not None:
        base_name = base_name[: -len(matched_ext)]

    if not path:
        path = os.path.join(directory, f"{base_name}-zip-uncompressed")
    logger.info(f"extracting zip {zip_filename} -> {path}")
    with zipfile.ZipFile(zip_filename) as archive:
        archive.extractall(path=path)
    return path
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
    """Extract a tar archive and return the directory it was extracted into.

    When ``path`` is falsy the target is ``<archive name without extension>-tar-uncompressed``
    in the same directory as the archive. The ``tar`` extraction filter is applied
    whenever the running interpreter provides it; otherwise a warning is logged and
    the archive is extracted unfiltered.
    """
    head, tail = os.path.split(tar_filename)
    # Strip the first matching tar extension (if any) from the file name.
    for ext in TAR_FILE_EXT:
        if tail.endswith(ext):
            tail = tail[: -(len(ext))]
            break

    path = path if path else os.path.join(head, f"{tail}-tar-uncompressed")
    logger.info(f"extracting tar {tar_filename} -> {path}")
    # NOTE: "r:*" mode opens both compressed (e.g ".tar.gz") and uncompressed ".tar" archives
    with tarfile.open(tar_filename, "r:*") as tfile:
        # NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
        # Extraction filters were added in Python 3.12 but are also backported to
        # security releases of older versions, so detect the feature itself rather
        # than comparing version numbers.
        # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
        if hasattr(tarfile, "tar_filter"):
            tfile.extraction_filter = tarfile.tar_filter
        else:
            logger.warning(
                "Extraction filtering for tar files is not available in this Python "
                "installation (added in Python 3.12 and backported to newer patch "
                "releases of older versions). "
                "Consider upgrading your Python version to improve security. "
                "See https://docs.python.org/3/library/tarfile.html#extraction-filters"
            )
        tfile.extractall(path=path)
    return path
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class CompressionSourceConnectorMixin:
    """Mixin for source connectors that need to expand compressed documents."""

    # Processor settings; ``output_dir`` is read to derive the nested output location.
    processor_config: ProcessorConfig
    # Read settings; ``download_dir`` is read to compute the archive-relative path.
    read_config: ReadConfig
    # Connector settings (not used by ``process_compressed_doc`` itself).
    connector_config: BaseConnectorConfig

    def process_compressed_doc(self, doc: BaseSingleIngestDoc) -> List[BaseSingleIngestDoc]:
        """
        Utility function which helps process compressed files. Extracts the contents and returns
        generated ingest docs via local source connector.

        The local connector gets shallow copies of this connector's read and processor
        configs; the processor copy's ``output_dir`` is extended with the extraction
        directory's path after the download dir has been removed from it.
        """
        # Download the raw file to local
        doc.get_file()
        extracted_dir = uncompress_file(filename=str(doc.filename))
        read_config_copy = copy.copy(self.read_config)
        processor_config_copy = copy.copy(self.processor_config)
        relative_path = extracted_dir.replace(self.read_config.download_dir, "")

        # Only insert a separator when the configured output dir does not already end with one.
        output_root = self.processor_config.output_dir
        joiner = "" if output_root.endswith(os.sep) else os.sep
        processor_config_copy.output_dir = f"{output_root}{joiner}{relative_path}"

        local_config = SimpleLocalConfig(
            input_path=extracted_dir,
            recursive=True,
        )
        local_connector = LocalSourceConnector(
            connector_config=local_config,
            read_config=read_config_copy,
            processor_config=processor_config_copy,
        )
        logger.info(f"created local source connector: {local_connector.to_json()}")
        local_connector.initialize()
        return local_connector.get_ingest_docs()
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
import json
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
|
|
6
|
+
|
|
7
|
+
import ndjson
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.v2.logger import logger
|
|
11
|
+
|
|
12
|
+
DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
|
|
13
|
+
|
|
14
|
+
T = TypeVar("T")
|
|
15
|
+
IterableT = Iterable[T]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Generator[pd.DataFrame, None, None]:
    """Yield consecutive row slices of ``df`` holding at most ``chunk_size`` rows each.

    Only non-empty slices are produced: a frame whose length is an exact
    multiple of ``chunk_size`` yields exactly ``len(df) / chunk_size`` chunks,
    and an empty frame yields nothing.

    Raises:
        ValueError: if ``chunk_size`` is less than 1 (raised on first iteration).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    # Stepping by chunk_size (instead of computing len // chunk_size + 1) avoids
    # the spurious empty trailing chunk on exact multiples.
    for start in range(0, len(df), chunk_size):
        yield df[start : start + chunk_size]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def batch_generator(
    iterable: "Iterable[T]", batch_size: int = 100
) -> "Generator[tuple[T, ...], None, None]":
    """A helper function to break an iterable into batches of size batch_size.

    Each batch is a tuple; the final batch may be shorter. An empty iterable
    yields nothing.

    Raises:
        ValueError: if ``batch_size`` is less than 1 (raised on first iteration).
            A size of 0 would otherwise produce no batches at all and silently
            drop every item.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    it = iter(iterable)
    # islice consumes the shared iterator, so each call picks up where the last stopped.
    while chunk := tuple(itertools.islice(it, batch_size)):
        yield chunk
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def generator_batching_wbytes(
    iterable: "Iterable[T]",
    batch_size_limit_bytes: Optional[int] = None,
    max_batch_size: Optional[int] = None,
) -> "Generator[list[T], None, None]":
    """A helper function to break an iterable into chunks of specified bytes.

    Items are accumulated into a list until adding the next item would push the
    batch past ``batch_size_limit_bytes`` (size measured as the UTF-8 length of
    ``json.dumps(item)``) or past ``max_batch_size`` items; the batch is then
    yielded and a new one started with that item.

    When neither limit is set, all items are yielded as a single batch. An item
    that on its own exceeds the byte limit is emitted in a batch by itself.
    Empty batches are never yielded.
    """
    if not batch_size_limit_bytes and not max_batch_size:
        # This function is a generator, so a plain `return iterable` would end the
        # iteration without yielding anything; emit everything as one batch instead.
        everything = list(iterable)
        if everything:
            yield everything
        return

    current_batch: list = []
    current_batch_size = 0

    for item in iterable:
        item_size_bytes = len(json.dumps(item).encode("utf-8"))
        exceeds_bytes = (
            batch_size_limit_bytes
            and current_batch_size + item_size_bytes > batch_size_limit_bytes
        )
        exceeds_count = max_batch_size and len(current_batch) + 1 > max_batch_size
        # Guard on current_batch so an oversized first item does not flush an empty batch.
        if current_batch and (exceeds_bytes or exceeds_count):
            yield current_batch
            current_batch, current_batch_size = [], 0

        current_batch.append(item)
        current_batch_size += item_size_bytes

    if current_batch:
        yield current_batch
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def flatten_dict(
    dictionary: dict[str, Any],
    parent_key: str = "",
    separator: str = "_",
    flatten_lists: bool = False,
    remove_none: bool = False,
    keys_to_omit: Optional[Sequence[str]] = None,
) -> dict[str, Any]:
    """Flattens a nested dictionary into a single level dictionary.

    Nested keys are joined as {parent_key}{separator}{key}. Keys listed in
    keys_to_omit (given in that joined form) are copied over without being
    flattened. With flatten_lists=True, list and tuple values are expanded using
    their indices as keys. With remove_none=True, entries whose value is None are
    dropped (unless the key is in keys_to_omit, which is checked first).
    """
    omitted = keys_to_omit if keys_to_omit else []
    result: dict[str, Any] = {}

    for key, value in dictionary.items():
        full_key = f"{parent_key}{separator}{key}" if parent_key else key

        if full_key in omitted:
            result[full_key] = value
            continue
        if remove_none and value is None:
            continue

        if isinstance(value, dict):
            nested = flatten_dict(
                value, full_key, separator, flatten_lists, remove_none, keys_to_omit=omitted
            )
        elif flatten_lists and isinstance(value, (list, tuple)):
            nested = {}
            for index, item in enumerate(value):
                # Wrap each item in a one-key dict so the recursion applies the
                # same rules (omit / None removal / nesting) to it.
                nested.update(
                    flatten_dict(
                        {f"{full_key}{separator}{index}": item},
                        "",
                        separator,
                        flatten_lists,
                        remove_none,
                        keys_to_omit=omitted,
                    )
                )
        else:
            nested = {full_key: value}

        result.update(nested)

    return result
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def validate_date_args(date: Optional[str] = None) -> bool:
    """Validate whether the provided date string satisfies any of the supported date formats.

    Used by unstructured/ingest/connector/biomed.py

    Returns `True` if the date string satisfies any of the supported formats, otherwise raises
    `ValueError`. A missing or empty date also raises `ValueError`.

    Supported Date Formats:
        - 'YYYY-MM-DD'
        - 'YYYY-MM-DDTHH:MM:SS'
        - 'YYYY-MM-DD+HH:MM:SS'
        - 'YYYY-MM-DDTHH:MM:SS±HHMM'
    """
    if not date:
        raise ValueError("The argument date is None.")

    def _parses_with(date_format: str) -> bool:
        # strptime signals a format mismatch with ValueError.
        try:
            datetime.strptime(date, date_format)
        except ValueError:
            return False
        return True

    if any(_parses_with(date_format) for date_format in DATE_FORMATS):
        return True

    raise ValueError(
        f"The argument {date} does not satisfy the format:"
        " YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SS±HHMM",
    )
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def get_data_by_suffix(path: Path) -> list[dict]:
    """Load the file at ``path`` using the reader selected by its suffix.

    ``.json`` and ``.ndjson`` are parsed from the opened file handle; ``.csv`` and
    ``.parquet`` are read with pandas and returned as a list of row dicts.

    Raises:
        ValueError: if the suffix is none of the four supported ones.
    """
    suffix = path.suffix
    # The file is opened for every suffix, so a missing file fails here first.
    with path.open() as handle:
        if suffix == ".json":
            return json.load(handle)
        if suffix == ".ndjson":
            return ndjson.load(handle)
        if suffix in (".csv", ".parquet"):
            read_frame = pd.read_csv if suffix == ".csv" else pd.read_parquet
            return read_frame(path).to_dict(orient="records")
    raise ValueError(f"Unsupported file type: {path}")
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def get_data(path: Path) -> list[dict]:
    """Load the file at ``path``, falling back to content sniffing if needed.

    The suffix-based reader is tried first. If it fails for any reason, the
    file is tried as JSON, then NDJSON, then CSV, then parquet, logging a
    warning for each failed attempt.

    Raises:
        IOError: if none of the readers can parse the file.
    """
    try:
        return get_data_by_suffix(path=path)
    except Exception as e:
        logger.warning(f"failed to read {path} by extension: {e}")
    # Fall back
    with path.open() as f:
        try:
            return json.load(f)
        except Exception as e:
            logger.warning(f"failed to read {path} as json: {e}")
        try:
            # The failed JSON attempt has already consumed the handle; rewind so
            # the NDJSON parser sees the real content instead of an empty stream
            # (which it would happily parse as an empty list).
            f.seek(0)
            return ndjson.load(f)
        except Exception as e:
            logger.warning(f"failed to read {path} as ndjson: {e}")
        # pandas readers open the path themselves, independent of `f`.
        try:
            df = pd.read_csv(path)
            return df.to_dict(orient="records")
        except Exception as e:
            logger.warning(f"failed to read {path} as csv: {e}")
        try:
            df = pd.read_parquet(path)
            return df.to_dict(orient="records")
        except Exception as e:
            logger.warning(f"failed to read {path} as parquet: {e}")

    raise IOError(f"File could not be parsed: {path}")
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def get_data_df(path: Path) -> pd.DataFrame:
    """Load the file at ``path`` into a DataFrame using the reader selected by its suffix.

    ``.json`` and ``.ndjson`` are parsed from the opened file handle and wrapped in
    a DataFrame; ``.csv`` and ``.parquet`` are read directly with pandas.

    Raises:
        ValueError: if the suffix is none of the four supported ones.
    """
    suffix = path.suffix
    # The file is opened for every suffix, so a missing file fails here first.
    with path.open() as handle:
        if suffix == ".json":
            return pd.DataFrame(data=json.load(handle))
        if suffix == ".ndjson":
            return pd.DataFrame(data=ndjson.load(handle))
        if suffix == ".csv":
            return pd.read_csv(path)
        if suffix == ".parquet":
            return pd.read_parquet(path)
    raise ValueError(f"Unsupported file type: {path}")
|