unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Optional, TypedDict, TypeVar, Union
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.v2.interfaces.connector import BaseConnector
|
|
9
|
+
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
10
|
+
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Settings shared by every downloader.
class DownloaderConfig(BaseModel):
    # Directory that downloaded files are written to. When left as None,
    # Downloader.download_dir fills it in lazily with the default location.
    download_dir: Optional[Path] = Field(
        default=None,
        # The two literals are concatenated, so the first must end with a space.
        description="Where files are downloaded to, defaults to a location at "
        "`$HOME/.cache/unstructured/ingest/download/<connector type>`.",
    )
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Type variable for annotations that accept DownloaderConfig or any subclass of it.
DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DownloadResponse(TypedDict):
    """Result of one download: the FileData record and the local path it was written to."""

    file_data: FileData
    path: Path
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Return type of Downloader.run: a single DownloadResponse or a list of them.
download_responses = Union[list[DownloadResponse], DownloadResponse]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Downloader(BaseProcess, BaseConnector, ABC):
    """Base class for processes that download the file described by a FileData record.

    Subclasses implement `run`. The helpers here resolve local paths and build
    the `DownloadResponse` returned to the caller.
    """

    connector_type: str
    download_config: DownloaderConfigT

    def get_download_path(self, file_data: FileData) -> Optional[Path]:
        """Return the local path for `file_data` under `download_dir`.

        Returns None when the record has no source identifiers or no relative path.
        """
        if not file_data.source_identifiers:
            return None
        rel_path = file_data.source_identifiers.relative_path
        if not rel_path:
            return None
        # Strip every leading "/" rather than just the first one: joining an
        # absolute path onto download_dir would discard download_dir entirely.
        return self.download_dir / Path(rel_path.lstrip("/"))

    @staticmethod
    def is_float(value: str):
        """Return True when `value` can be converted with float(), else False."""
        try:
            float(value)
        except (TypeError, ValueError):
            # TypeError covers non-string, non-numeric inputs such as None.
            return False
        return True

    def generate_download_response(
        self, file_data: FileData, download_path: Path
    ) -> DownloadResponse:
        """Record `download_path` on `file_data` and wrap both in a DownloadResponse.

        When both metadata dates are present and float-parsable they are applied
        to the downloaded file with os.utime before the response is built.
        """
        metadata = file_data.metadata
        if (
            metadata.date_modified
            and self.is_float(metadata.date_modified)
            and metadata.date_created
            and self.is_float(metadata.date_created)
        ):
            date_modified = float(metadata.date_modified)
            date_created = float(metadata.date_created)
            # os.utime takes (atime, mtime): the creation date is stored as the
            # access time and the modification date as the modification time.
            os.utime(download_path, times=(date_created, date_modified))
        file_data.local_download_path = str(download_path.resolve())
        return DownloadResponse(file_data=file_data, path=download_path)

    @property
    def download_dir(self) -> Path:
        """Return the configured download directory, filling in the default if unset.

        The default is written back onto `download_config` so later reads reuse it.
        """
        if self.download_config.download_dir is None:
            self.download_config.download_dir = (
                Path.home()
                / ".cache"
                / "unstructured"
                / "ingest"
                / "download"
                / self.connector_type
            ).resolve()
        return self.download_config.download_dir

    def is_async(self) -> bool:
        """Return True; this base class reports itself as async."""
        return True

    @abstractmethod
    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download the file for `file_data`; must be implemented by subclasses."""
        pass

    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Async entry point; the default simply calls the synchronous `run`."""
        return self.run(file_data=file_data, **kwargs)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Optional
|
|
4
|
+
from uuid import NAMESPACE_DNS, uuid5
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.v2.logger import logger
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# File name and path information for a source file.
class SourceIdentifiers(BaseModel):
    filename: str
    fullpath: str
    rel_path: Optional[str] = None

    @property
    def filename_stem(self) -> str:
        """Return `filename` without its final suffix."""
        stem = Path(self.filename).stem
        return stem

    @property
    def relative_path(self) -> str:
        """Return `rel_path` when it is set and non-empty, otherwise `fullpath`."""
        if self.rel_path:
            return self.rel_path
        return self.fullpath
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Optional metadata about a source file; every field defaults to None.
class FileDataSourceMetadata(BaseModel):
    url: Optional[str] = None
    version: Optional[str] = None
    record_locator: Optional[dict[str, Any]] = None
    # Dates are stored as strings. Downloader.generate_download_response only
    # applies date_created/date_modified to the file when both parse as floats.
    date_created: Optional[str] = None
    date_modified: Optional[str] = None
    date_processed: Optional[str] = None
    permissions_data: Optional[list[dict[str, Any]]] = None
    filesize_bytes: Optional[int] = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Record describing one source file, with JSON load/save helpers.
class FileData(BaseModel):
    identifier: str
    connector_type: str
    source_identifiers: SourceIdentifiers
    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
    additional_metadata: dict[str, Any] = Field(default_factory=dict)
    reprocess: bool = False
    local_download_path: Optional[str] = None
    display_name: Optional[str] = None

    @classmethod
    def from_file(cls, path: str) -> "FileData":
        """Load and validate an instance from the JSON file at `path`.

        Raises ValueError when `path` does not point to an existing regular file.
        """
        resolved = Path(path).resolve()
        # is_file() is already False for a missing path, so one check covers both.
        if not resolved.is_file():
            raise ValueError(f"file path not valid: {resolved}")
        with resolved.open("rb") as f:
            raw = json.load(f)
        return cls.model_validate(raw)

    @classmethod
    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
        """Re-validate `file_data` as `cls`, forwarding `kwargs` to model_validate."""
        return cls.model_validate(file_data.model_dump(), **kwargs)

    def to_file(self, path: str) -> None:
        """Write this record as indented JSON to `path`, creating parent directories."""
        target = Path(path).resolve()
        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open("w") as f:
            json.dump(self.model_dump(), f, indent=2)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# One entry of a batch: a required identifier and an optional version string.
class BatchItem(BaseModel):
    identifier: str
    version: Optional[str] = None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# FileData variant describing a batch of items instead of a single file.
class BatchFileData(FileData):
    # Derived from batch_items by populate_identifier when the caller omits it.
    identifier: str = Field(init=False)
    batch_items: list[BatchItem]
    source_identifiers: Optional[SourceIdentifiers] = None

    @field_validator("batch_items")
    @classmethod
    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
        """Reject empty or duplicate batch items and return them sorted by identifier."""
        if not v:
            raise ValueError("batch items cannot be empty")
        all_identifiers = [item.identifier for item in v]
        if len(all_identifiers) != len(set(all_identifiers)):
            raise ValueError(f"duplicate identifiers: {all_identifiers}")
        return sorted(v, key=lambda item: item.identifier)

    @model_validator(mode="before")
    @classmethod
    def populate_identifier(cls, data: Any) -> Any:
        """Derive a deterministic identifier from the batch items when none is given.

        Items may be BatchItem instances or plain dicts (as when loading from
        JSON). Input that is missing or malformed is returned untouched so the
        regular field validation reports it as a ValidationError; raising
        KeyError/AttributeError here would bypass callers that catch
        ValidationError to fall back to FileData.
        """
        if not isinstance(data, dict) or "identifier" in data:
            return data
        batch_items = data.get("batch_items")
        if not isinstance(batch_items, (list, tuple)) or not batch_items:
            return data

        versions_by_id = {}
        for item in batch_items:
            if isinstance(item, dict):
                item_id = item.get("identifier")
                version = item.get("version")
            else:
                item_id = getattr(item, "identifier", None)
                version = getattr(item, "version", None)
            if not isinstance(item_id, str):
                return data
            if version is not None and not isinstance(version, str):
                return data
            versions_by_id[item_id] = version

        identifier_data = json.dumps(versions_by_id, sort_keys=True)
        # Build a new dict instead of mutating the caller's input.
        return {**data, "identifier": str(uuid5(NAMESPACE_DNS, identifier_data))}
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def file_data_from_file(path: str) -> FileData:
    """Load the file at `path` as BatchFileData, falling back to FileData.

    The fallback is taken only when batch validation raises ValidationError.
    """
    try:
        batch_file_data = BatchFileData.from_file(path=path)
    except ValidationError:
        logger.debug(f"{path} not valid for batch file data")
        return FileData.from_file(path=path)
    return batch_file_data
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def file_data_from_dict(data: dict) -> FileData:
    """Validate `data` as BatchFileData, falling back to FileData.

    The fallback is taken only when batch validation raises ValidationError.
    """
    try:
        batch_file_data = BatchFileData.model_validate(data)
    except ValidationError:
        logger.debug(f"{data} not valid for batch file data")
        return FileData.model_validate(data)
    return batch_file_data
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.v2.interfaces.connector import BaseConnector
|
|
7
|
+
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
8
|
+
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Base settings model for indexers; declares no fields of its own.
class IndexerConfig(BaseModel):
    pass
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Type variable for annotations that accept IndexerConfig or any subclass of it.
IndexerConfigT = TypeVar("IndexerConfigT", bound=IndexerConfig)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Indexer(BaseProcess, BaseConnector, ABC):
    """Base class for indexing processes; `run` is annotated as a generator of FileData."""

    connector_type: str
    # Optional indexer settings; defaults to None.
    index_config: Optional[IndexerConfigT] = None

    def is_async(self) -> bool:
        """Return False; this base class reports itself as synchronous."""
        return False

    @abstractmethod
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Produce FileData records; must be implemented by subclasses."""
        pass

    async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
        """Async counterpart of `run`; raises NotImplementedError unless overridden."""
        raise NotImplementedError()
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class BaseProcess(ABC):
    """Abstract base for a process exposing sync `run` and async `run_async` entry points."""

    def is_async(self) -> bool:
        """Return False; subclasses override this to report async support."""
        return False

    def precheck(self) -> None:
        """Hook that does nothing in the base class."""
        return None

    @abstractmethod
    def run(self, **kwargs: Any) -> Any:
        """Execute the process; must be implemented by subclasses."""

    async def run_async(self, **kwargs: Any) -> Any:
        """Async entry point; the default calls the synchronous `run` directly."""
        outcome = self.run(**kwargs)
        return outcome
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from asyncio import Semaphore
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
7
|
+
|
|
8
|
+
# Default working directory under the user's home, resolved once at import time.
DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Pipeline-wide processing options plus a small amount of runtime state.
class ProcessorConfig(BaseModel):
    # asyncio.Semaphore is not a pydantic-native type, so arbitrary types are allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    reprocess: bool = Field(
        default=False,
        description="Reprocess a downloaded file even if the relevant structured "
        "output .json file in output directory already exists.",
    )
    verbose: bool = Field(default=False)
    tqdm: bool = Field(default=False, description="Display tqdm progress bar")
    work_dir: str = Field(
        default_factory=lambda: DEFAULT_WORK_DIR,
        description="Where to place working files when processing each step",
    )
    num_processes: int = Field(
        default=2, description="Number of parallel processes with which to process docs"
    )
    max_connections: Optional[int] = Field(
        default=None, description="Limit of concurrent connectionts"
    )
    raise_on_error: bool = Field(
        default=False,
        description="Is set, will raise error if any doc in the pipeline fail. "
        "Otherwise will log error and continue with other docs",
    )
    # Default is read from the INGEST_DISABLE_PARALLELISM environment variable
    # each time a config is created.
    disable_parallelism: bool = Field(
        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true",
    )
    preserve_downloads: bool = Field(
        default=False, description="Don't delete downloaded files after process completes"
    )
    download_only: bool = Field(
        default=False, description="skip the rest of the process after files are downloaded"
    )
    re_download: bool = Field(
        default=False,
        description="If set, will re-download downloaded files "
        "regardless of if they already exist locally",
    )
    uncompress: bool = Field(
        default=False,
        description="Uncompress any archived files. Currently supporting "
        "zip and tar files based on file extension.",
    )
    iter_delete: bool = Field(
        default=False,
        description="If limited on memory, this can be enabled to delete "
        "cached content as it's used and no longer needed in the pipeline.",
    )
    delete_cache: bool = Field(
        default=False,
        description="If set, will delete the cache work directory when process finishes",
    )

    # OTEL support
    otel_endpoint: Optional[str] = Field(
        default=None, description="OTEL endpoint to publish trace data to"
    )

    # Used to keep track of state in pipeline
    status: dict = Field(default_factory=dict)
    semaphore: Optional[Semaphore] = Field(init=False, default=None, exclude=True)

    def model_post_init(self, __context: Any) -> None:
        """Create the semaphore sized by `max_connections` when a limit is set."""
        if self.max_connections is None:
            return
        self.semaphore = Semaphore(self.max_connections)

    @property
    def mp_supported(self) -> bool:
        """True when parallelism is enabled and more than one process is configured."""
        if self.disable_parallelism:
            return False
        return self.num_processes > 1

    @property
    def async_supported(self) -> bool:
        """True when parallelism is enabled and any integer connection limit exceeds one."""
        if self.disable_parallelism:
            return False
        limit = self.max_connections
        # None is not an int, so this single check also covers "no limit set".
        if isinstance(limit, int):
            return limit > 1
        return True
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from abc import ABC
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, TypeVar
|
|
6
|
+
|
|
7
|
+
import ndjson
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
11
|
+
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Base settings model for upload stagers; declares no fields of its own.
class UploadStagerConfig(BaseModel):
    pass
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Type variable for annotations that accept UploadStagerConfig or any subclass of it.
UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class UploadStager(BaseProcess, ABC):
    """Rewrites an elements file element by element through `conform_dict`.

    `.json` inputs are loaded whole; `.ndjson` inputs are streamed row by row.
    """

    upload_stager_config: UploadStagerConfigT

    def write_output(self, output_path: Path, data: list[dict]) -> None:
        """Write `data` to `output_path` as JSON or NDJSON, chosen by its suffix.

        Raises ValueError for any other suffix, before the file is opened.
        """
        suffix = output_path.suffix
        if suffix not in (".json", ".ndjson"):
            raise ValueError(f"Unsupported output format: {output_path}")
        with output_path.open("w") as f:
            if suffix == ".json":
                json.dump(data, f, indent=2)
            else:
                ndjson.dump(data, f)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Return the element to stage; the base implementation returns it unchanged."""
        return element_dict

    def get_output_path(self, output_filename: str, output_dir: Path) -> Path:
        """Return `output_dir` joined with the final component of `output_filename`.

        The parent directory of the result is created if it does not exist.
        """
        # stem + suffix is exactly the final path component, i.e. Path.name.
        staged_path = Path(output_dir) / Path(output_filename).name
        staged_path.parent.mkdir(parents=True, exist_ok=True)
        return staged_path

    def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """Stream NDJSON rows from `input_file`, conform each, and write to `output_file`."""
        with input_file.open() as in_f, output_file.open("w") as out_f:
            rows = ndjson.reader(in_f)
            writer = ndjson.writer(out_f)
            for row in rows:
                writer.writerow(row=self.conform_dict(element_dict=row, file_data=file_data))
                writer.f.flush()

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """Load the whole JSON `input_file`, conform every element, and write `output_file`."""
        with input_file.open() as in_f:
            loaded = json.load(in_f)

        conformed = []
        for element in loaded:
            conformed.append(self.conform_dict(element_dict=element, file_data=file_data))

        with output_file.open("w") as out_f:
            json.dump(conformed, out_f, indent=2)

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Stage `elements_filepath` into `output_dir` and return the output path.

        Raises ValueError when the input suffix is neither `.ndjson` nor `.json`;
        the output directory has already been created by then.
        """
        output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
        suffix = elements_filepath.suffix
        if suffix == ".ndjson":
            stage = self.stream_update
        elif suffix == ".json":
            stage = self.process_whole
        else:
            raise ValueError(f"Unsupported file extension: {elements_filepath}")
        stage(input_file=elements_filepath, output_file=output_file, file_data=file_data)
        return output_file

    async def run_async(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Async entry point; calls the synchronous `run` with the same arguments."""
        staged = self.run(
            elements_filepath=elements_filepath,
            output_dir=output_dir,
            output_filename=output_filename,
            file_data=file_data,
            **kwargs,
        )
        return staged
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, TypeVar
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.utils.data_prep import get_data
|
|
9
|
+
from unstructured_ingest.v2.interfaces.connector import BaseConnector
|
|
10
|
+
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
11
|
+
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Base settings model for uploaders; declares no fields of its own.
class UploaderConfig(BaseModel):
    pass
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Type variable for annotations that accept UploaderConfig or any subclass of it.
UploaderConfigT = TypeVar("UploaderConfigT", bound=UploaderConfig)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class UploadContent:
    """Pairs a local file path with its FileData record; element type of `Uploader.run_batch`."""

    path: Path
    file_data: FileData
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class Uploader(BaseProcess, BaseConnector, ABC):
    """Base class for upload processes.

    `run`/`run_async` load the data at a path with `get_data` and pass it to
    `run_data`/`run_data_async`, which subclasses are expected to implement.
    """

    upload_config: UploaderConfigT
    connector_type: str

    def is_async(self) -> bool:
        """Return False; this base class reports itself as synchronous."""
        return False

    def is_batch(self) -> bool:
        """Return False; this base class reports no batch support."""
        return False

    def run_batch(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Upload several items at once; raises NotImplementedError unless overridden."""
        raise NotImplementedError()

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the data at `path` and forward it to `run_data`."""
        self.run_data(data=get_data(path=path), file_data=file_data, **kwargs)

    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the data at `path` and await `run_data_async` with it."""
        await self.run_data_async(data=get_data(path=path), file_data=file_data, **kwargs)

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload already-loaded data; raises NotImplementedError unless overridden."""
        raise NotImplementedError()

    async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Async variant of `run_data`; the default calls `run_data` directly."""
        outcome = self.run_data(data=data, file_data=file_data, **kwargs)
        return outcome
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
from logging import Formatter, Logger, StreamHandler, getLevelName, getLogger
|
|
5
|
+
from typing import Any, Callable
|
|
6
|
+
|
|
7
|
+
# Log level name, read once at import time from INGEST_LOG_LEVEL; defaults to "INFO".
log_level = os.getenv("INGEST_LOG_LEVEL", "INFO")
# Name of the logger that make_default_logger configures.
LOGGER_NAME = "unstructured_ingest.v2"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def default_is_data_sensitive(k: str, v: Any) -> bool:
    """Decide whether the value stored under key `k` should be redacted.

    A key is sensitive when its lowercased name contains a trigger word and the
    value is truthy, or when the name is one of the always-sensitive fields
    (regardless of the value).
    """
    always_sensitive = ("account_name", "client_id")
    trigger_words = ("key", "cred", "token", "password", "oauth", "secret")
    has_trigger = bool(v) and any(word in k.lower() for word in trigger_words)
    return has_trigger or k.lower() in always_sensitive
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def hide_sensitive_fields(
    data: dict, is_sensitive_fn: Callable[[str, Any], bool] = default_is_data_sensitive
) -> dict:
    """
    Return a shallow copy of `data` with sensitive values replaced by "*******".

    Every k, v pair is passed to is_sensitive_fn. A pair it flags is redacted as a
    whole, even when the value is a dict or a JSON string, so a flagged container
    never exposes its contents. Otherwise nested dicts are processed recursively
    with the same is_sensitive_fn, and string values that parse as a JSON object
    are processed and replaced with the json.dumps() of the redacted dict.
    """
    new_data = data.copy()
    for k, v in data.items():
        if is_sensitive_fn(k, v):
            new_data[k] = "*******"
        elif isinstance(v, dict):
            new_data[k] = hide_sensitive_fields(v, is_sensitive_fn)
        elif isinstance(v, str):
            # Need to take into account strings generated via json.dumps()
            try:
                json_data = json.loads(v)
            except json.JSONDecodeError:
                continue
            if isinstance(json_data, dict):
                new_data[k] = json.dumps(hide_sensitive_fields(json_data, is_sensitive_fn))

    return new_data
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _parse_object_at(s: str, start: int, decoder: json.JSONDecoder):
    """Parse the dict text beginning at s[start] == "{".

    Returns (dict, end_index) on success, or (None, start) when the text there
    is neither a JSON object nor a Python dict literal.
    """
    try:
        # raw_decode understands strings, so braces inside values are handled.
        parsed, length = decoder.raw_decode(s[start:])
        return parsed, start + length
    except json.JSONDecodeError:
        pass

    # Fallback for printed Python dicts (single quotes, None, True, ...):
    # find the matching closing brace, then evaluate the span as a literal.
    depth = 0
    for end in range(start, len(s)):
        if s[end] == "{":
            depth += 1
        elif s[end] == "}":
            depth -= 1
            if depth == 0:
                break
    else:
        # Opening brace is never closed.
        return None, start

    try:
        normalized = json.loads(json.dumps(ast.literal_eval(s[start : end + 1])))
    except (ValueError, SyntaxError, TypeError):
        return None, start
    if not isinstance(normalized, dict):
        return None, start
    return normalized, end + 1


def redact_jsons(s: str) -> str:
    """
    Takes in a generic string and redacts every dict embedded in it. Each JSON
    object (or printed Python dict) found is passed through
    hide_sensitive_fields() and replaced by the json.dumps() of the result. There
    can be any number of such objects in the string.

    Text that merely contains braces is left untouched instead of raising: an
    unmatched '{', or a brace-delimited span that does not parse, is skipped and
    scanning resumes right after that '{', so '{ text, {"a": 3}' still has its
    inner object processed.
    """
    if "{" not in s:
        return s

    decoder = json.JSONDecoder()
    pieces = []
    copied_upto = 0  # index of the first character not yet copied into pieces
    i = 0
    while i < len(s):
        if s[i] != "{":
            i += 1
            continue
        parsed, end = _parse_object_at(s, i, decoder)
        if parsed is None:
            i += 1
            continue
        pieces.append(s[copied_upto:i])
        pieces.append(json.dumps(hide_sensitive_fields(parsed)))
        i = copied_upto = end
    pieces.append(s[copied_upto:])
    return "".join(pieces)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class SensitiveFormatter(Formatter):
    """Formatter that passes every rendered record through redact_jsons."""

    def format(self, record):
        """Render `record` and return the redacted text.

        If redaction raises, the rendered message is returned prefixed with
        "Failed to redact: "; that fallback contains the unredacted text.
        """
        rendered = super().format(record=record)
        try:
            redacted = redact_jsons(rendered)
        except Exception:
            return f"Failed to redact: {rendered}"
        return redacted
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def remove_root_handlers(logger: Logger) -> None:
    """Remove every handler attached to the root logger reachable from `logger`."""
    # NOTE(robinson): in some environments such as Google Colab, there is a root handler
    # that does not mask secrets, meaning sensitive info such as api keys appear in logs.
    # Removing these when they exist prevents this behavior
    #
    # Iterate over a snapshot: removeHandler() mutates root.handlers, and removing
    # while iterating the live list skips every other handler.
    for handler in list(logger.root.handlers):
        logger.root.removeHandler(handler)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def make_default_logger(level: int) -> Logger:
    """Return the LOGGER_NAME logger, configured with a redacting stream handler.

    The handler is attached only when the logger has no handler with the same
    name yet. The level is always set and the root logger's handlers are
    removed on every call.
    """
    ingest_logger = getLogger(LOGGER_NAME)

    stream_handler = StreamHandler()
    stream_handler.name = "ingest_log_handler"
    stream_handler.setFormatter(
        SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    )

    already_attached = any(h.name == stream_handler.name for h in ingest_logger.handlers)
    if not already_attached:
        ingest_logger.addHandler(stream_handler)

    ingest_logger.setLevel(level)
    remove_root_handlers(ingest_logger)
    return ingest_logger
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# Module-level logger, created at import time with the level named by log_level.
logger = make_default_logger(level=getLevelName(log_level.upper()))
|