unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import typing as t
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("unstructured_ingest")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def default_is_data_sensitive(k: str, v: t.Any) -> bool:
    """Decide whether the value stored under key ``k`` should be redacted.

    A pair is considered sensitive when either:
      * the value is truthy and the lower-cased key contains one of the trigger
        substrings ("key", "cred", "token", "password", "oauth", "secret"), or
      * the lower-cased key is exactly "account_name" or "client_id"
        (regardless of the value).
    """
    lowered = k.lower()
    triggers = ("key", "cred", "token", "password", "oauth", "secret")
    # Empty/falsy values are not worth masking on a substring match alone.
    if v and any(trigger in lowered for trigger in triggers):
        return True
    return lowered in ("account_name", "client_id")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def hide_sensitive_fields(
    data: dict, is_sensitive_fn: t.Callable[[str, t.Any], bool] = default_is_data_sensitive
) -> dict:
    """
    Return a shallow copy of ``data`` with sensitive values replaced by "*******".

    Every k, v pair is passed to ``is_sensitive_fn``; nested dicts are processed
    recursively with the same predicate. String values that parse as a JSON object
    are processed too, and the original string is replaced with the json.dumps()
    version of the redacted dict. The input dict itself is not modified.

    Args:
        data: mapping to redact.
        is_sensitive_fn: predicate called with (key, value); a truthy result
            causes the value to be masked.

    Returns:
        A new dict with the same keys as ``data``.
    """
    new_data = data.copy()
    # Iterate the original mapping so the copy can be updated freely.
    for k, v in data.items():
        if is_sensitive_fn(k, v):
            new_data[k] = "*******"
        # NOTE(review): as in the original implementation, a dict / JSON-object value
        # stored under a sensitive key replaces the mask with its recursively
        # redacted form — confirm this is the intended precedence.
        if isinstance(v, dict):
            new_data[k] = hide_sensitive_fields(v, is_sensitive_fn)
        elif isinstance(v, str):
            # Need to take into account strings generated via json.dumps()
            try:
                json_data = json.loads(v)
            except json.JSONDecodeError:
                continue
            if isinstance(json_data, dict):
                new_data[k] = json.dumps(hide_sensitive_fields(json_data, is_sensitive_fn))

    return new_data
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _find_closing_brace(text: str, start: int) -> int:
    """Return the index of the '}' balancing the '{' at ``start``, or -1 if none."""
    depth = 0
    for pos in range(start, len(text)):
        ch = text[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return pos
    return -1


def _redact_json_candidate(candidate: str) -> t.Optional[str]:
    """Return the redacted json.dumps() form of ``candidate``, or None if it is not a dict."""
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        # Fall back to Python literal syntax, e.g. the result of printing a dict.
        try:
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    if not isinstance(parsed, dict):
        return None
    try:
        # Round-trip through JSON so keys/values are normalized to JSON types.
        normalized = json.loads(json.dumps(parsed))
    except (TypeError, ValueError):
        return None
    return json.dumps(hide_sensitive_fields(normalized))


def redact_jsons(s: str) -> str:
    """
    Takes in a generic string and redacts every JSON object (or printed Python
    dict) found in it. Leverages hide_sensitive_fields() to redact any sensitive
    information and replaces the original text with the json.dumps() form of the
    redacted dict. There can be any number of such objects in the string.

    Brace spans that cannot be parsed (unbalanced '{', format placeholders such
    as '{name}', set literals, ...) are left untouched instead of raising; the
    scan then resumes just after the unparsable '{' so objects nested inside it
    are still redacted.

    Args:
        s: the text to scan, typically an already formatted log line.

    Returns:
        ``s`` with each parsable object replaced by its redacted form.
    """
    if "{" not in s:
        return s

    pieces: t.List[str] = []
    copied_upto = 0  # everything before this index is already in ``pieces``
    i = 0
    while i < len(s):
        if s[i] != "{":
            i += 1
            continue
        end = _find_closing_brace(s, i)
        redacted = _redact_json_candidate(s[i : end + 1]) if end != -1 else None
        if redacted is None:
            # Not a redactable object; keep the brace as plain text and move on.
            i += 1
            continue
        pieces.append(s[copied_upto:i])
        pieces.append(redacted)
        i = end + 1
        copied_upto = i
    pieces.append(s[copied_upto:])
    return "".join(pieces)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class SensitiveFormatter(logging.Formatter):
    """Formatter that passes every rendered log line through redact_jsons()."""

    def format(self, record):
        # Render with the stock formatter first, then redact the final text.
        rendered = super().format(record)
        return redact_jsons(rendered)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def remove_root_handlers(logger: logging.Logger) -> None:
    """Detach every handler currently attached to the root logger.

    Args:
        logger: any logger; only its ``root`` attribute is used.
    """
    # NOTE(robinson): in some environments such as Google Colab, there is a root handler
    # that does not mask secrets, meaning sensitive info such as api keys appear in logs.
    # Removing these when they exist prevents this behavior
    #
    # Iterate over a snapshot: removeHandler() mutates ``root.handlers`` in place, and
    # removing from the list being iterated would skip every other handler.
    for handler in list(logger.root.handlers):
        logger.root.removeHandler(handler)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def ingest_log_streaming_init(level: int) -> None:
    """Attach the redacting stream handler to the module logger (once) and set its level.

    Also removes any handlers attached to the root logger via remove_root_handlers().

    Args:
        level: logging level applied to the module-level ``logger``.
    """
    stream_handler = logging.StreamHandler()
    stream_handler.name = "ingest_log_handler"
    stream_handler.setFormatter(
        SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    )

    # Only want to add the handler once
    already_attached = any(h.name == "ingest_log_handler" for h in logger.handlers)
    if not already_attached:
        logger.addHandler(stream_handler)

    remove_root_handlers(logger)
    logger.setLevel(level)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def make_default_logger(level: int) -> logging.Logger:
    """Return the "unstructured_ingest" logger configured with the redacting handler.

    The stream handler named "ingest_log_handler" is only attached when no handler
    with that name is present yet, so calling this repeatedly (or after
    ingest_log_streaming_init()) does not produce duplicate log lines.

    Args:
        level: logging level to set on the logger.

    Returns:
        The configured "unstructured_ingest" logger.
    """
    logger = logging.getLogger("unstructured_ingest")
    if not any(h.name == "ingest_log_handler" for h in logger.handlers):
        handler = logging.StreamHandler()
        handler.name = "ingest_log_handler"
        formatter = SensitiveFormatter(
            "%(asctime)s %(processName)-10s %(levelname)-8s %(message)s"
        )
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    logger.setLevel(level)
    remove_root_handlers(logger)
    return logger
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from .doc_factory import DocFactory
|
|
2
|
+
from .interfaces import PipelineContext, ReformatNode
|
|
3
|
+
from .partition import Partitioner
|
|
4
|
+
from .permissions import PermissionsDataCleaner
|
|
5
|
+
from .pipeline import Pipeline
|
|
6
|
+
from .reformat.chunking import Chunker
|
|
7
|
+
from .reformat.embedding import Embedder
|
|
8
|
+
from .source import Reader
|
|
9
|
+
from .write import Writer
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"DocFactory",
|
|
13
|
+
"Partitioner",
|
|
14
|
+
"Reader",
|
|
15
|
+
"Embedder",
|
|
16
|
+
"PipelineContext",
|
|
17
|
+
"Pipeline",
|
|
18
|
+
"Writer",
|
|
19
|
+
"Chunker",
|
|
20
|
+
"ReformatNode",
|
|
21
|
+
"PermissionsDataCleaner",
|
|
22
|
+
]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
|
|
6
|
+
from unstructured_ingest.logger import logger
|
|
7
|
+
from unstructured_ingest.pipeline.interfaces import CopyNode
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Copier(CopyNode):
    """Copy node that places a processed json file at its ingest doc's output path."""

    def run(self, json_path: str):
        """Copy ``json_path`` to the ``_output_filename`` of the matching ingest doc.

        The file name without its extension is used as the key into
        ``pipeline_context.ingest_docs_map``.
        """
        doc_hash, _ = os.path.splitext(os.path.basename(json_path))
        doc_dict = self.pipeline_context.ingest_docs_map[doc_hash]
        desired_output = create_ingest_doc_from_dict(doc_dict)._output_filename
        # Make sure the destination directory exists before copying.
        Path(desired_output).parent.mkdir(parents=True, exist_ok=True)
        logger.info(f"copying {json_path} -> {desired_output}")
        shutil.copy(json_path, desired_output)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.pipeline.interfaces import DocFactoryNode
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class DocFactory(DocFactoryNode):
    """Doc factory node that lists ingest docs from the source connector as dicts."""

    def run(self, *args, **kwargs) -> t.Iterable[dict]:
        """Return ``to_dict()`` of every doc yielded by the connector's get_ingest_docs()."""
        return [
            ingest_doc.to_dict() for ingest_doc in self.source_doc_connector.get_ingest_docs()
        ]
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import multiprocessing as mp
|
|
5
|
+
import typing as t
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from multiprocessing.managers import DictProxy
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from dataclasses_json import DataClassJsonMixin
|
|
12
|
+
|
|
13
|
+
from unstructured_ingest.error import SourceConnectionNetworkError
|
|
14
|
+
from unstructured_ingest.interfaces import (
|
|
15
|
+
BaseDestinationConnector,
|
|
16
|
+
BaseSourceConnector,
|
|
17
|
+
PartitionConfig,
|
|
18
|
+
ProcessorConfig,
|
|
19
|
+
ReadConfig,
|
|
20
|
+
RetryStrategyConfig,
|
|
21
|
+
)
|
|
22
|
+
from unstructured_ingest.logger import ingest_log_streaming_init, logger
|
|
23
|
+
|
|
24
|
+
if t.TYPE_CHECKING:
|
|
25
|
+
from unstructured_ingest.ingest_backoff import RetryHandler
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class PipelineContext(ProcessorConfig):
    """
    Data that gets shared across each pipeline node
    """

    def __post_init__(self):
        # Populated later through the ``ingest_docs_map`` setter.
        self._ingest_docs_map: t.Optional[DictProxy] = None

    @property
    def ingest_docs_map(self) -> DictProxy:
        """Shared doc map; raises ValueError if it has not been assigned yet."""
        docs_map = self._ingest_docs_map
        if docs_map is not None:
            return docs_map
        raise ValueError("ingest_docs_map never initialized")

    @ingest_docs_map.setter
    def ingest_docs_map(self, value: DictProxy):
        self._ingest_docs_map = value
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class PipelineNode(DataClassJsonMixin, ABC):
    """
    Class that encapsulates logic to run during a single pipeline step
    """

    pipeline_context: PipelineContext

    def __call__(self, iterable: t.Optional[t.Iterable[t.Any]] = None) -> t.Any:
        """Initialize the node, execute ``run`` and return the stored result.

        Depending on the node and context, ``run`` is called once with the whole
        iterable (no multiprocessing support), once per item in this process
        (``num_processes == 1``), or once per item through a multiprocessing pool.
        ``None`` entries are dropped from iterable results.
        """
        items = iterable or []
        if items:
            logger.info(
                f"calling {self.__class__.__name__} " f"with {len(items)} docs",  # type: ignore
            )

        self.initialize()
        if not self.supported_multiprocessing():
            self.result = self.run(items) if items else self.run()
        elif self.pipeline_context.num_processes == 1:
            self.result = [self.run(item) for item in items] if items else self.run()
        else:
            worker_log_level = logging.DEBUG if self.pipeline_context.verbose else logging.INFO
            with mp.Pool(
                processes=self.pipeline_context.num_processes,
                initializer=ingest_log_streaming_init,
                initargs=(worker_log_level,),
            ) as pool:
                self.result = pool.map(self.run, items)
        # Remove None which may be caused by failed docs that didn't raise an error
        if isinstance(self.result, t.Iterable):
            self.result = [entry for entry in self.result if entry is not None]
        return self.result

    def supported_multiprocessing(self) -> bool:
        """Whether ``run`` may be fanned out per item; True unless overridden."""
        return True

    @abstractmethod
    def run(self, *args, **kwargs) -> t.Optional[t.Any]:
        pass

    def initialize(self):
        """Create the node's working directory (if it has one) and set up log streaming."""
        path = self.get_path()
        if path:
            logger.info(f"creating {path}")
            path.mkdir(parents=True, exist_ok=True)
        log_level = logging.DEBUG if self.pipeline_context.verbose else logging.INFO
        ingest_log_streaming_init(log_level)

    def get_path(self) -> t.Optional[Path]:
        """Directory used by this node, or None when it has none."""
        return None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@dataclass
class DocFactoryNode(PipelineNode):
    """
    Encapsulated logic to generate a list of ingest docs
    """

    source_doc_connector: BaseSourceConnector

    def initialize(self):
        """Log the connector config, run the base initialization, then initialize the connector."""
        connector_json = self.source_doc_connector.to_json()
        logger.info(
            f"Running doc factory to generate ingest docs. "
            f"Source connector: {connector_json}",
        )
        super().initialize()
        self.source_doc_connector.initialize()

    @abstractmethod
    def run(self, *args, **kwargs) -> t.Iterable[dict]:
        pass

    def supported_multiprocessing(self) -> bool:
        # run() is invoked a single time rather than once per item.
        return False
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@dataclass
class SourceNode(PipelineNode):
    """A pipeline node representing logic to pull data from a source using base ingest documents.

    Attributes:
        read_config: A configuration object specifying how to read data from the source.
        retry_strategy_config: Optional configuration specifying the strategy for network errors.

    Properties:
        retry_strategy: A retry handler built from ``retry_strategy_config``, or None
            when no config is set.

    Methods:
        initialize: Logs that the source node is running and performs base initialization.
        run: Abstract method for downloading data associated with ingest documents.
    """

    read_config: ReadConfig
    retry_strategy_config: t.Optional[RetryStrategyConfig] = None

    @property
    def retry_strategy(self) -> t.Optional["RetryHandler"]:
        config = self.retry_strategy_config
        if not config:
            return None

        # Imported lazily, only when a retry strategy is actually configured.
        import backoff

        from unstructured_ingest.ingest_backoff import RetryHandler

        return RetryHandler(
            backoff.expo,
            SourceConnectionNetworkError,
            max_time=config.max_retry_time,
            max_tries=config.max_retries,
            logger=logger,
            start_log_level=logger.level,
            backoff_log_level=logger.level,
        )

    def initialize(self):
        logger.info("Running source node to download data associated with ingest docs")
        super().initialize()

    @abstractmethod
    def run(self, ingest_doc_json: str) -> t.Optional[str]:
        pass
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@dataclass
class PartitionNode(PipelineNode):
    """
    Encapsulates logic to run partition on the json files as the output of the source node
    """

    partition_config: PartitionConfig
    partition_kwargs: dict = field(default_factory=dict)

    def initialize(self):
        """Log the partition configuration, then perform the base initialization."""
        config_json = self.partition_config.to_json()
        kwargs_json = json.dumps(self.partition_kwargs)
        logger.info(
            f"Running partition node to extract content from json files. "
            f"Config: {config_json}, "
            f"partition kwargs: {kwargs_json}]",
        )
        super().initialize()

    def create_hash(self) -> str:
        """Return the first 32 hex chars of the SHA-256 of the config plus partition kwargs."""
        payload = self.partition_config.to_dict()
        payload["partition_kwargs"] = self.partition_kwargs
        # sort_keys makes the serialized form independent of key order.
        serialized = json.dumps(payload, sort_keys=True)
        digest = hashlib.sha256(serialized.encode()).hexdigest()
        return digest[:32]

    @abstractmethod
    def run(self, json_path: str) -> t.Optional[str]:
        pass

    def get_path(self) -> Path:
        """Resolved ``<work_dir>/partitioned`` directory."""
        work_dir = Path(self.pipeline_context.work_dir)
        return (work_dir / "partitioned").resolve()
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
@dataclass
class ReformatNode(PipelineNode, ABC):
    """
    Base node for any step that reformats the List[Element] content produced by
    partitioning before it is written.
    """

    @abstractmethod
    def run(self, elements_json: str) -> t.Optional[str]:
        ...
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
@dataclass
class WriteNode(PipelineNode):
    """
    Encapsulated logic to write the final result to a downstream data connection
    """

    dest_doc_connector: BaseDestinationConnector

    @abstractmethod
    def run(self, json_paths: t.List[str]):
        pass

    def initialize(self):
        """Log the redacted connector config, run base initialization, then initialize the connector."""
        connector_json = self.dest_doc_connector.to_json(redact_sensitive=True)
        logger.info(
            f"Running write node to upload content. "
            f"Destination connector: {connector_json}]",
        )
        super().initialize()
        self.dest_doc_connector.initialize()

    def supported_multiprocessing(self) -> bool:
        # run() receives the full list of paths in a single call.
        return False
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
@dataclass
class CopyNode(PipelineNode):
    """
    Base node for copying the final result of the pipeline to the designated output location.
    """

    def initialize(self):
        logger.info("Running copy node to move content to desired output location")
        super().initialize()

    @abstractmethod
    def run(self, json_path: str):
        ...
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
@dataclass
class PermissionsNode(PipelineNode):
    """
    Base node for operations on permissions related data.
    """

    def initialize(self):
        logger.info("Running permissions node to cleanup the permissions folder")
        super().initialize()

    @abstractmethod
    def run(self):
        ...
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
import typing as t
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
|
|
9
|
+
from unstructured_ingest.error import PartitionError
|
|
10
|
+
from unstructured_ingest.logger import logger
|
|
11
|
+
from unstructured_ingest.pipeline.interfaces import PartitionNode
|
|
12
|
+
from unstructured_ingest.pipeline.utils import get_ingest_doc_hash
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class Partitioner(PartitionNode):
    """Partition node that processes one ingest doc and writes its elements to a json file."""

    @PartitionError.wrap
    def run(self, ingest_doc_dict) -> Optional[str]:
        """Partition a single ingest doc and return the path of the resulting json file.

        The output file name is derived from the partition config hash and the
        ingest doc hash. An existing non-empty output is reused unless
        ``pipeline_context.reprocess`` is set. On failure the exception is re-raised
        when ``pipeline_context.raise_on_error`` is set; otherwise it is logged and
        None is returned.
        """
        try:
            doc = create_ingest_doc_from_dict(ingest_doc_dict)
            doc_filename_hash = get_ingest_doc_hash(ingest_doc_dict)
            combined_key = f"{self.create_hash()}{doc_filename_hash}"
            hashed_filename = hashlib.sha256(combined_key.encode()).hexdigest()[:32]
            # Register the doc under the output name so later nodes can look it up.
            self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_dict
            json_path = (Path(self.get_path()) / f"{hashed_filename}.json").resolve()

            can_reuse = (
                not self.pipeline_context.reprocess
                and json_path.is_file()
                and json_path.stat().st_size
            )
            if can_reuse:
                logger.info(f"file exists: {json_path}, skipping partition")
                return str(json_path)

            config = self.partition_config
            partition_kwargs: t.Dict[str, t.Any] = {
                "strategy": config.strategy,
                "encoding": config.encoding,
                "pdf_infer_table_structure": config.pdf_infer_table_structure,
                "languages": config.ocr_languages,
                "hi_res_model_name": config.hi_res_model_name,
            }
            if config.skip_infer_table_types:
                partition_kwargs["skip_infer_table_types"] = config.skip_infer_table_types
            if config.additional_partition_args:
                partition_kwargs.update(config.additional_partition_args)

            elements = doc.process_file(partition_config=config, **partition_kwargs)
            with open(json_path, "w", encoding="utf8") as output_f:
                logger.info(f"writing partitioned content to {json_path}")
                json.dump(elements, output_f, ensure_ascii=False, indent=2, sort_keys=True)
            return str(json_path)
        except Exception as e:
            if self.pipeline_context.raise_on_error:
                raise
            logger.error(f"failed to partition doc: {ingest_doc_dict}, {e}", exc_info=True)
            return None
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.interfaces import PermissionsCleanupMixin, ProcessorConfig
|
|
4
|
+
from unstructured_ingest.pipeline.interfaces import PermissionsNode
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class PermissionsDataCleaner(PermissionsNode, PermissionsCleanupMixin):
    """Permissions node whose run step delegates to ``cleanup_permissions()``."""

    processor_config: ProcessorConfig

    def run(self):
        """Invoke ``cleanup_permissions()`` and return its result implicitly as None."""
        self.cleanup_permissions()
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import multiprocessing as mp
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
from dataclasses_json import DataClassJsonMixin
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
|
|
9
|
+
from unstructured_ingest.interfaces import BaseIngestDocBatch, BaseSingleIngestDoc
|
|
10
|
+
from unstructured_ingest.logger import ingest_log_streaming_init, logger
|
|
11
|
+
from unstructured_ingest.pipeline.copy import Copier
|
|
12
|
+
from unstructured_ingest.pipeline.interfaces import (
|
|
13
|
+
DocFactoryNode,
|
|
14
|
+
PartitionNode,
|
|
15
|
+
PipelineContext,
|
|
16
|
+
ReformatNode,
|
|
17
|
+
SourceNode,
|
|
18
|
+
WriteNode,
|
|
19
|
+
)
|
|
20
|
+
from unstructured_ingest.pipeline.permissions import PermissionsDataCleaner
|
|
21
|
+
from unstructured_ingest.pipeline.utils import get_ingest_doc_hash
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class Pipeline(DataClassJsonMixin):
    """Runs the ingest nodes in order: doc factory, source, partition, reformat, copy, write."""

    pipeline_context: PipelineContext
    doc_factory_node: DocFactoryNode
    source_node: SourceNode
    partition_node: Optional[PartitionNode] = None
    write_node: Optional[WriteNode] = None
    reformat_nodes: "list[ReformatNode]" = field(default_factory=list)
    permissions_node: Optional[PermissionsDataCleaner] = None

    def initialize(self):
        """Set up log streaming at DEBUG when the context is verbose, INFO otherwise."""
        ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)

    def get_nodes_str(self):
        """Return the class names of the configured nodes joined by " -> "."""
        nodes = [self.doc_factory_node, self.source_node, self.partition_node]
        nodes.extend(self.reformat_nodes)
        if self.write_node:
            nodes.append(self.write_node)
        # Skip unset optional nodes so the summary never contains "NoneType".
        names = [node.__class__.__name__ for node in nodes if node is not None]
        # The copier always runs; its name is enough, no instance is needed here.
        names.append(Copier.__name__)
        return " -> ".join(names)

    def expand_batch_docs(self, dict_docs: "list[dict[str, Any]]") -> "list[dict[str, Any]]":
        """Flatten batch ingest docs into the dicts of their single ingest docs.

        Raises:
            ValueError: if a dict does not deserialize to a BaseSingleIngestDoc or a
                BaseIngestDocBatch.
        """
        expanded_docs: list[dict[str, Any]] = []
        for d in dict_docs:
            doc = create_ingest_doc_from_dict(d)
            if isinstance(doc, BaseSingleIngestDoc):
                expanded_docs.append(doc.to_dict())
            elif isinstance(doc, BaseIngestDocBatch):
                expanded_docs.extend([single_doc.to_dict() for single_doc in doc.ingest_docs])
            else:
                raise ValueError(
                    f"type of doc ({type(doc)}) is not a recognized type: "
                    f"BaseSingleIngestDoc or BaseIngestDocBatch"
                )
        return expanded_docs

    def run(self):
        """Execute the pipeline end to end.

        Returns early (with an info log) when no docs are found, when the read
        config is download-only, or when any stage yields no output.

        Raises:
            ValueError: if files were fetched but no partition node is set.
        """
        logger.info(
            f"running pipeline: {self.get_nodes_str()} "
            f"with config: {self.pipeline_context.to_json()}",
        )
        self.initialize()
        # NOTE(review): the manager is never shut down explicitly; the proxies stored on
        # the context keep referring to it after run() returns — confirm before changing.
        manager = mp.Manager()
        self.pipeline_context.ingest_docs_map = manager.dict()
        # -- Get the documents to be processed --
        dict_docs = self.doc_factory_node()
        dict_docs = [manager.dict(d) for d in dict_docs]
        if not dict_docs:
            logger.info("no docs found to process")
            return
        logger.info(
            f"processing {len(dict_docs)} docs via "
            f"{self.pipeline_context.num_processes} processes",
        )
        for doc in dict_docs:
            self.pipeline_context.ingest_docs_map[get_ingest_doc_hash(doc)] = doc
        fetched_filenames = self.source_node(iterable=dict_docs)
        if self.source_node.read_config.download_only:
            logger.info("stopping pipeline after downloading files")
            return
        if not fetched_filenames:
            logger.info("No files to run partition over")
            return
        # -- To support batches ingest docs, expand those into the populated single ingest
        # -- docs after downloading content
        dict_docs = self.expand_batch_docs(dict_docs=dict_docs)
        if self.partition_node is None:
            raise ValueError("partition node not set")
        partitioned_jsons = self.partition_node(iterable=dict_docs)
        if not partitioned_jsons:
            logger.info("No files to process after partitioning")
            return
        for reformat_node in self.reformat_nodes:
            reformatted_jsons = reformat_node(iterable=partitioned_jsons)
            if not reformatted_jsons:
                logger.info(f"no files to process after {reformat_node.__class__.__name__}")
                return
            partitioned_jsons = reformatted_jsons

        # -- Copy the final destination to the desired location --
        copier = Copier(
            pipeline_context=self.pipeline_context,
        )
        copier(iterable=partitioned_jsons)

        if self.write_node:
            logger.info(
                f"uploading elements from {len(partitioned_jsons)} "
                "document(s) to the destination"
            )
            self.write_node(iterable=partitioned_jsons)

        if self.permissions_node:
            self.permissions_node.cleanup_permissions()
|
|
File without changes
|