unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import typing as t
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
8
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
9
|
+
from unstructured_ingest.interfaces import (
|
|
10
|
+
AccessConfig,
|
|
11
|
+
BaseConnectorConfig,
|
|
12
|
+
BaseSingleIngestDoc,
|
|
13
|
+
BaseSourceConnector,
|
|
14
|
+
IngestDocCleanupMixin,
|
|
15
|
+
SourceConnectorCleanupMixin,
|
|
16
|
+
SourceMetadata,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.logger import logger
|
|
19
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
20
|
+
|
|
21
|
+
if t.TYPE_CHECKING:
|
|
22
|
+
from atlassian import Confluence
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class ConfluenceAccessConfig(AccessConfig):
    """Secret material needed to authenticate against Confluence."""

    # API token; declared through enhanced_field with sensitive=True.
    api_token: str = enhanced_field(sensitive=True)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class SimpleConfluenceConfig(BaseConnectorConfig):
    """Connection settings for a Confluence Cloud instance.

    ``user_email`` and the ``api_token`` held by ``access_config`` are the
    credentials used to authenticate, and ``url`` points at the Confluence
    Cloud instance itself.

    See https://developer.atlassian.com/cloud/confluence/basic-auth-for-rest-apis/
    for details on obtaining an api_token.
    """

    # Account email used as the username when connecting.
    user_email: str
    # Holder of the API token.
    access_config: ConfluenceAccessConfig
    # Base URL of the Confluence Cloud instance.
    url: str
    # Upper bound on how many spaces are listed when no explicit list is given.
    max_num_of_spaces: int = 500
    # Upper bound on how many documents are listed per space.
    max_num_of_docs_from_each_space: int = 100
    # Explicit space keys to ingest; empty means "discover spaces".
    spaces: t.List[str] = field(default_factory=list)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class ConfluenceDocumentMeta:
    """Identifies one Confluence document.

    Holds the id of the space the document lives in together with the id of
    the document itself.
    """

    # Id of the Confluence space containing the document.
    space_id: str
    # Id of the document being fetched.
    document_id: str
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def scroll_wrapper(func):
    """Wrap a paginated ``func`` so one call collects several pages.

    The returned wrapper requires a ``number_of_items_to_fetch`` keyword. It
    removes that keyword, then repeatedly calls ``func`` with ``limit`` (at most
    100) and an advancing ``start`` keyword (initial value taken from the
    caller, default 0). ``func`` may return either a list of items or a dict
    holding the items under ``"results"``.

    Returns:
        A wrapper returning a list with at most ``number_of_items_to_fetch``
        items.
    """

    def wrapper(*args, **kwargs):
        """Wraps a function to obtain scroll functionality."""
        number_of_items_to_fetch = kwargs.pop("number_of_items_to_fetch")

        # Nothing requested: avoid limit=0, which would divide by zero below.
        if number_of_items_to_fetch <= 0:
            return []

        kwargs["limit"] = min(100, number_of_items_to_fetch)
        kwargs["start"] = kwargs.get("start", 0)

        all_results = []
        num_iterations = math.ceil(number_of_items_to_fetch / kwargs["limit"])

        for _ in range(num_iterations):
            # Call the API exactly once per page and reuse the response.
            response = func(*args, **kwargs)
            if isinstance(response, list):
                page = response
            elif isinstance(response, dict):
                page = response["results"]
            else:
                page = []

            # An empty page means there is nothing further to scroll through.
            if not page:
                break

            all_results += page
            kwargs["start"] += kwargs["limit"]

        return all_results[:number_of_items_to_fetch]

    return wrapper
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass
class ConfluenceIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating fetching a doc and writing processed results (but not
    doing the processing).

    Current implementation creates a Confluence connection object
    to fetch each doc, rather than creating one for each thread.
    """

    connector_config: SimpleConfluenceConfig
    document_meta: ConfluenceDocumentMeta
    registry_name: str = "confluence"

    # TODO: remove one of filename or _tmp_download_file, using a wrapper
    @property
    def filename(self):
        """Resolved download path ``<download_dir>/<space_id>/<document_id>.html``.

        Returns None when ``read_config.download_dir`` is not set.
        """
        if not self.read_config.download_dir:
            return None
        return (
            Path(self.read_config.download_dir)
            / self.document_meta.space_id
            / f"{self.document_meta.document_id}.html"
        ).resolve()

    @property
    def _output_filename(self):
        """Create output file path based on output directory, space id and document id."""
        output_file = f"{self.document_meta.document_id}.json"
        return Path(self.processor_config.output_dir) / self.document_meta.space_id / output_file

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Instance URL and page id that locate this document."""
        return {
            "url": self.connector_config.url,
            "page_id": self.document_meta.document_id,
        }

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_page(self):
        """Fetch this page by id, expanding history, version and rendered body.

        Returns:
            The page payload, or None when the client raises ``ApiError``
            (the error is logged).
        """
        from atlassian import Confluence
        from atlassian.errors import ApiError

        try:
            confluence = Confluence(
                self.connector_config.url,
                username=self.connector_config.user_email,
                password=self.connector_config.access_config.api_token,
            )
            result = confluence.get_page_by_id(
                page_id=self.document_meta.document_id,
                expand="history.lastUpdated,version,body.view",
            )
        except ApiError as e:
            logger.error(e)
            return None
        return result

    def update_source_metadata(self, **kwargs):
        """Fetches file metadata from the current page.

        Args:
            **kwargs: may carry ``page``, an already-fetched page payload. An
                explicit ``page=None`` is treated as "page not found".
        """
        # Only call the API when the caller did not supply a page. Using
        # kwargs.get("page", self._get_page()) would evaluate the default
        # eagerly and fetch the page a second time.
        page = kwargs["page"] if "page" in kwargs else self._get_page()
        if page is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        document_history = page["history"]
        date_created = datetime.strptime(
            document_history["createdDate"],
            "%Y-%m-%dT%H:%M:%S.%fZ",
        ).isoformat()
        if last_updated := document_history.get("lastUpdated", {}).get("when", ""):
            date_modified = datetime.strptime(
                last_updated,
                "%Y-%m-%dT%H:%M:%S.%fZ",
            ).isoformat()
        else:
            # No update recorded: fall back to the creation date.
            date_modified = date_created
        version = page["version"]["number"]
        self.source_metadata = SourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            version=version,
            source_url=page["_links"].get("self", None),
            exists=True,
        )

    @SourceConnectionError.wrap
    @requires_dependencies(["atlassian"], extras="confluence")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the page's rendered body and write it to ``self.filename``.

        Raises:
            ValueError: if the page could not be retrieved.
        """
        # TODO: instead of having a separate connection object for each doc,
        # have a separate connection object for each process

        result = self._get_page()
        self.update_source_metadata(page=result)
        if result is None:
            raise ValueError(f"Failed to retrieve page with ID {self.document_meta.document_id}")
        self.document = result["body"]["view"]["value"]
        self.filename.parent.mkdir(parents=True, exist_ok=True)
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(self.document)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
@dataclass
class ConfluenceSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Fetches body fields from all documents within all spaces in a Confluence Cloud instance."""

    connector_config: SimpleConfluenceConfig
    _confluence: t.Optional["Confluence"] = field(init=False, default=None)

    @property
    def confluence(self) -> "Confluence":
        """Confluence client, created on first access and reused afterwards."""
        from atlassian import Confluence

        if self._confluence is None:
            self._confluence = Confluence(
                url=self.connector_config.url,
                username=self.connector_config.user_email,
                password=self.connector_config.access_config.api_token,
            )
        return self._confluence

    @requires_dependencies(["atlassian", "requests"], extras="Confluence")
    def check_connection(self):
        """Issue a HEAD request against the space endpoint.

        Raises:
            SourceConnectionError: if the request raises ``requests.HTTPError``.
        """
        import requests

        url = "rest/api/space"
        try:
            self.confluence.request(method="HEAD", path=url)
        except requests.HTTPError as http_error:
            logger.error(f"failed to validate connection: {http_error}", exc_info=True)
            # Chain the cause so the original HTTP error stays in the traceback.
            raise SourceConnectionError(
                f"failed to validate connection: {http_error}"
            ) from http_error

    @requires_dependencies(["atlassian"], extras="Confluence")
    def initialize(self):
        """Record the explicit list of spaces, if one was configured."""
        self.list_of_spaces = None
        if self.connector_config.spaces:
            self.list_of_spaces = self.connector_config.spaces
            if self.connector_config.max_num_of_spaces:
                # Single-line message: a triple-quoted literal would carry the
                # source file's newlines and indentation into the log output.
                logger.warning(
                    "--confluence-list-of-spaces and --confluence-num-of-spaces cannot "
                    "be used at the same time. Connector will only fetch the "
                    "--confluence-list-of-spaces that you've provided.",
                )

    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_space_ids(self):
        """Fetches spaces in a confluence domain."""

        get_spaces_with_scroll = scroll_wrapper(self.confluence.get_all_spaces)

        all_results = get_spaces_with_scroll(
            number_of_items_to_fetch=self.connector_config.max_num_of_spaces,
        )

        return [space["key"] for space in all_results]

    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_docs_ids_within_one_space(
        self,
        space_id: str,
        content_type: str = "page",
    ):
        """Return ``(space_id, doc_id)`` pairs for documents in one space.

        At most ``max_num_of_docs_from_each_space`` documents are listed.
        """
        get_pages_with_scroll = scroll_wrapper(self.confluence.get_all_pages_from_space)
        results = get_pages_with_scroll(
            space=space_id,
            number_of_items_to_fetch=self.connector_config.max_num_of_docs_from_each_space,
            content_type=content_type,
        )

        return [(space_id, doc["id"]) for doc in results]

    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_doc_ids_within_spaces(self):
        """Return ``(space_id, doc_id)`` pairs across all selected spaces.

        Uses the explicit list stored by ``initialize`` when present, and
        otherwise lists the spaces from the instance.
        """
        space_ids = self._get_space_ids() if not self.list_of_spaces else self.list_of_spaces

        # Flatten per-space results; the loop name avoids shadowing builtin ``id``.
        return [
            pair
            for space_id in space_ids
            for pair in self._get_docs_ids_within_one_space(space_id=space_id)
        ]

    def get_ingest_docs(self):
        """Build one ConfluenceIngestDoc per document found in the selected spaces."""
        doc_ids = self._get_doc_ids_within_spaces()
        return [
            ConfluenceIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                document_meta=ConfluenceDocumentMeta(space_id, doc_id),
            )
            for space_id, doc_id in doc_ids
        ]
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import typing as t
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from io import BytesIO
|
|
7
|
+
from pathlib import PurePath
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
10
|
+
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
11
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
12
|
+
from unstructured_ingest.interfaces import (
|
|
13
|
+
AccessConfig,
|
|
14
|
+
BaseConnectorConfig,
|
|
15
|
+
BaseDestinationConnector,
|
|
16
|
+
BaseSingleIngestDoc,
|
|
17
|
+
WriteConfig,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.logger import logger
|
|
20
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
21
|
+
|
|
22
|
+
if t.TYPE_CHECKING:
|
|
23
|
+
from databricks.sdk import WorkspaceClient
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class DatabricksVolumesAccessConfig(AccessConfig):
    """Optional authentication settings for the Databricks workspace client.

    Every field defaults to None. The secret-bearing fields (``password``,
    ``client_secret``, ``token``, ``azure_client_secret``) are declared with
    ``sensitive=True``.
    """

    account_id: t.Optional[str] = None
    # Username / password pair.
    username: t.Optional[str] = None
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # Client id / secret pair.
    client_id: t.Optional[str] = None
    client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    profile: t.Optional[str] = None
    # Azure-specific settings.
    azure_workspace_resource_id: t.Optional[str] = None
    azure_client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    azure_client_id: t.Optional[str] = None
    azure_tenant_id: t.Optional[str] = None
    azure_environment: t.Optional[str] = None
    auth_type: t.Optional[str] = None
    cluster_id: t.Optional[str] = None
    # Google-specific settings.
    google_credentials: t.Optional[str] = None
    google_service_account: t.Optional[str] = None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class SimpleDatabricksVolumesConfig(BaseConnectorConfig):
    """Connector settings: access configuration plus an optional workspace host."""

    # Authentication settings forwarded to the workspace client.
    access_config: DatabricksVolumesAccessConfig
    # Workspace host passed to the client; None leaves it unspecified.
    host: t.Optional[str] = None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class DatabricksVolumesWriteConfig(WriteConfig):
    """Describes where in a Databricks volume results are written."""

    volume: str
    catalog: str
    # Optional sub-path inside the volume.
    volume_path: t.Optional[str] = None
    overwrite: bool = False
    encoding: str = "utf-8"
    schema: str = "default"

    @property
    def path(self) -> str:
        """Return ``/Volumes/<catalog>/<schema>/<volume>[/<volume_path>]``."""
        volume_root = f"/Volumes/{self.catalog}/{self.schema}/{self.volume}"
        return f"{volume_root}/{self.volume_path}" if self.volume_path else volume_root
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class DatabricksVolumesDestinationConnector(BaseDestinationConnector):
    """Uploads processed element JSON into a Databricks volume."""

    write_config: DatabricksVolumesWriteConfig
    connector_config: SimpleDatabricksVolumesConfig
    _client: t.Optional["WorkspaceClient"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """Serialise the connector from a shallow copy whose client is cleared."""
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_client"):
            setattr(self_cp, "_client", None)
        return _asdict(self_cp, **kwargs)

    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
    def generate_client(self) -> "WorkspaceClient":
        """Build a WorkspaceClient from the host and the access config fields."""
        from databricks.sdk import WorkspaceClient

        return WorkspaceClient(
            host=self.connector_config.host, **self.connector_config.access_config.to_dict()
        )

    @property
    def client(self) -> "WorkspaceClient":
        """Workspace client, created on first access and reused afterwards."""
        if self._client is None:
            self._client = self.generate_client()
        return self._client

    def check_connection(self):
        """Verify that the authenticated user is reported as active.

        Raises:
            DestinationConnectionError: if the lookup fails or the user is not
                active.
        """
        try:
            # Explicit check rather than ``assert``: asserts are stripped under
            # ``python -O``, which would silently disable this validation.
            if not self.client.current_user.me().active:
                raise ValueError("current user is not active")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def initialize(self):
        """Create the client up front."""
        _ = self.client

    def write_dict(
        self,
        *args,
        elements_dict: t.List[t.Dict[str, t.Any]],
        filename: t.Optional[str] = None,
        indent: int = 4,
        encoding: str = "utf-8",
        **kwargs,
    ) -> None:
        """Upload ``elements_dict`` as JSON under the configured volume path.

        Args:
            elements_dict: elements to serialise with ``json.dumps``.
            filename: optional name appended to the volume path; separators at
                either end are stripped first.
            indent: accepted but not used by this implementation.
            encoding: accepted but not used; the payload is encoded with
                ``write_config.encoding``.
        """
        output_folder = self.write_config.path
        filename = (
            filename.strip(os.sep) if filename else filename
        )  # Make sure filename doesn't begin with file separator
        output_path = str(PurePath(output_folder, filename)) if filename else output_folder
        logger.debug(f"uploading content to {output_path}")
        self.client.files.upload(
            file_path=output_path,
            contents=BytesIO(json.dumps(elements_dict).encode(encoding=self.write_config.encoding)),
            overwrite=self.write_config.overwrite,
        )

    def get_elements_dict(self, docs: t.List[BaseSingleIngestDoc]) -> t.List[t.Dict[str, t.Any]]:
        """Not implemented for this connector; returns None."""
        pass

    def write(self, docs: t.List[BaseSingleIngestDoc]) -> None:
        """Load each doc's output JSON from disk and upload it."""
        for doc in docs:
            file_path = doc.base_output_filename
            filename = file_path if file_path else None
            with open(doc._output_filename) as json_file:
                logger.debug(f"uploading content from {doc._output_filename}")
                json_list = json.load(json_file)
                self.write_dict(elements_dict=json_list, filename=filename)
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import typing as t
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime as dt
|
|
5
|
+
from multiprocessing import Process
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
9
|
+
from unstructured_ingest.interfaces import (
|
|
10
|
+
BaseConnectorConfig,
|
|
11
|
+
BaseDestinationConnector,
|
|
12
|
+
BaseSingleIngestDoc,
|
|
13
|
+
BaseSourceConnector,
|
|
14
|
+
IngestDocCleanupMixin,
|
|
15
|
+
SourceConnectorCleanupMixin,
|
|
16
|
+
SourceMetadata,
|
|
17
|
+
WriteConfig,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.logger import logger
|
|
20
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
21
|
+
|
|
22
|
+
if t.TYPE_CHECKING:
|
|
23
|
+
from deltalake import DeltaTable
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class SimpleDeltaTableConfig(BaseConnectorConfig):
    """Settings identifying a Delta table and how to open it."""

    # Location of the table, as a string or a Path.
    table_uri: t.Union[str, Path]
    # Table version passed to DeltaTable; None leaves it unspecified.
    version: t.Optional[int] = None
    # Extra storage options passed through to DeltaTable.
    storage_options: t.Optional[t.Dict[str, str]] = None
    # Passed through to DeltaTable as ``without_files``.
    without_files: bool = False
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class DeltaTableIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """One data file of a Delta table, downloaded and stored locally as CSV."""

    connector_config: SimpleDeltaTableConfig
    uri: str
    modified_date: str
    created_at: str
    registry_name: str = "delta-table"

    def uri_filename(self) -> str:
        """Return the base name of ``uri`` without its extension."""
        basename = os.path.basename(self.uri)
        return os.path.splitext(basename)[0]

    @property
    def filename(self):
        """Resolved download path ``<download_dir>/<uri_filename>.csv``."""
        return (Path(self.read_config.download_dir) / f"{self.uri_filename()}.csv").resolve()

    @property
    def _output_filename(self):
        """Output path ``<output_dir>/<uri_filename>.json``."""
        return Path(self.processor_config.output_dir) / f"{self.uri_filename()}.json"

    def _create_full_tmp_dir_path(self):
        """Ensure the parent directories of the download and output files exist."""
        self.filename.parent.mkdir(parents=True, exist_ok=True)
        self._output_filename.parent.mkdir(parents=True, exist_ok=True)

    @requires_dependencies(["fsspec"], extras="delta-table")
    def _get_fs_from_uri(self):
        """Resolve the fsspec filesystem for ``uri``.

        Raises:
            ImportError: if resolving the filesystem fails with an ImportError;
                the original error is chained as the cause.
        """
        from fsspec.core import url_to_fs

        try:
            fs, _ = url_to_fs(self.uri)
        except ImportError as error:
            raise ImportError(
                f"uri {self.uri} may be associated with a filesystem that "
                f"requires additional dependencies: {error}",
            ) from error
        return fs

    def update_source_metadata(self, **kwargs):
        """Populate ``source_metadata`` for this file.

        Args:
            **kwargs: may carry ``fs``, an already-resolved filesystem, which
                avoids resolving it again.
        """
        # Resolve the filesystem lazily. kwargs.get("fs", self._get_fs_from_uri())
        # would build a second filesystem even when one was supplied.
        fs = kwargs.get("fs")
        if fs is None:
            fs = self._get_fs_from_uri()
        file_exists = fs.exists(self.uri)
        # Only ask for a checksum/etag when the file exists, so a missing file
        # is reported through exists=False rather than a lookup error.
        version = None
        if file_exists:
            version = (
                fs.checksum(self.uri) if fs.protocol != "gs" else fs.info(self.uri).get("etag", "")
            )
        self.source_metadata = SourceMetadata(
            date_created=self.created_at,
            date_modified=self.modified_date,
            version=version,
            source_url=self.uri,
            exists=file_exists,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Read the file into a dataframe and write it to ``self.filename`` as CSV."""
        fs = self._get_fs_from_uri()
        self.update_source_metadata(fs=fs)
        logger.info(f"using a {fs} filesystem to collect table data")
        self._create_full_tmp_dir_path()

        df = self._get_df(filesystem=fs)

        logger.info(f"writing {len(df)} rows to {self.filename}")
        df.to_csv(self.filename)

    @SourceConnectionNetworkError.wrap
    def _get_df(self, filesystem):
        """Load ``uri`` through pyarrow's ParquetDataset into a pandas DataFrame."""
        import pyarrow.parquet as pq

        return pq.ParquetDataset(self.uri, filesystem=filesystem).read_pandas().to_pandas()
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@dataclass
class DeltaTableSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Lists the data files of a Delta table as individual ingest docs."""

    connector_config: SimpleDeltaTableConfig
    delta_table: t.Optional["DeltaTable"] = None

    def check_connection(self):
        """No connection check is performed."""
        pass

    @requires_dependencies(["deltalake"], extras="delta-table")
    def initialize(self):
        """Open the configured table and fail when it holds no rows.

        Raises:
            ValueError: if the table's row count is not positive.
        """
        from deltalake import DeltaTable

        config = self.connector_config
        self.delta_table = DeltaTable(
            table_uri=config.table_uri,
            version=config.version,
            storage_options=config.storage_options,
            without_files=config.without_files,
        )
        row_count = self.delta_table.to_pyarrow_dataset().count_rows()
        if not row_count > 0:
            raise ValueError(f"no data found at {self.connector_config.table_uri}")
        logger.info(f"processing {row_count} rows of data")

    def get_ingest_docs(self):
        """Return one DeltaTableIngestDoc per file URI of the table.

        Raises:
            ValueError: if ``initialize`` has not set ``delta_table``.
        """
        table = self.delta_table
        if not table:
            raise ValueError("delta table was never initialized")

        # Map each add-action path to its modification time, as a string.
        modified_by_path = {}
        for _, action in table.get_add_actions().to_pandas().iterrows():
            modified_by_path[action["path"]] = str(action["modification_time"])

        # created_time is divided by 1000 before conversion to a datetime.
        created = str(dt.fromtimestamp(table.metadata().created_time / 1000))

        docs = []
        for file_uri in table.file_uris():
            docs.append(
                DeltaTableIngestDoc(
                    connector_config=self.connector_config,
                    processor_config=self.processor_config,
                    read_config=self.read_config,
                    uri=file_uri,
                    modified_date=modified_by_path[os.path.basename(file_uri)],
                    created_at=created,
                )
            )
        return docs
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@dataclass
class DeltaTableWriteConfig(WriteConfig):
    """Options controlling how elements are written to a Delta table."""

    # Forwarded to convert_to_pandas_dataframe when building the dataframe.
    drop_empty_cols: bool = False
    # Forwarded to write_deltalake as ``mode``.
    mode: t.Literal["error", "append", "overwrite", "ignore"] = "error"
    # Forwarded to write_deltalake as ``schema_mode`` only when not None.
    schema_mode: t.Optional[t.Literal["merge", "overwrite"]] = None
    # Forwarded to write_deltalake as ``engine``.
    engine: t.Literal["pyarrow", "rust"] = "pyarrow"
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@dataclass
class DeltaTableDestinationConnector(BaseDestinationConnector):
    """Writes processed elements into a Delta table."""

    write_config: DeltaTableWriteConfig
    connector_config: SimpleDeltaTableConfig

    @requires_dependencies(["deltalake"], extras="delta-table")
    def initialize(self):
        """Nothing to set up beyond the dependency check."""
        pass

    def check_connection(self):
        """No connection check is performed."""
        pass

    @requires_dependencies(["deltalake"], extras="delta-table")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Convert the elements to a dataframe and write it in a child process.

        Args:
            elements_dict: elements to convert and write.
        """
        from deltalake.writer import write_deltalake

        from unstructured_ingest.utils.table import convert_to_pandas_dataframe

        options = self.write_config
        table_uri = self.connector_config.table_uri

        df = convert_to_pandas_dataframe(
            elements_dict=elements_dict,
            drop_empty_cols=options.drop_empty_cols,
        )
        logger.info(
            f"writing {len(df)} rows to destination table "
            f"at {self.connector_config.table_uri}\ndtypes: {df.dtypes}",
        )

        writer_kwargs = dict(
            table_or_uri=table_uri,
            data=df,
            mode=options.mode,
            engine=options.engine,
        )
        schema_mode = options.schema_mode
        if schema_mode is not None:
            writer_kwargs["schema_mode"] = schema_mode

        # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
        # ingest to fail, even though all tasks are completed normally. Putting the writer into a
        # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
        # rust backend to finish
        writer = Process(target=write_deltalake, kwargs=writer_kwargs)
        writer.start()
        writer.join()
|