unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import typing as t
|
|
3
|
+
import uuid
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
7
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
8
|
+
from unstructured_ingest.interfaces import (
|
|
9
|
+
AccessConfig,
|
|
10
|
+
BaseConnectorConfig,
|
|
11
|
+
BaseDestinationConnector,
|
|
12
|
+
WriteConfig,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
|
|
16
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
|
+
|
|
18
|
+
if t.TYPE_CHECKING:
|
|
19
|
+
from chromadb import Collection as ChromaCollection
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class ChromaAccessConfig(AccessConfig):
    """Credential-adjacent options for the Chroma client."""

    # Optional client settings forwarded to the chromadb client constructor.
    settings: t.Optional[t.Dict[str, str]] = None
    # Optional HTTP headers sent with every request (used by the HTTP client).
    headers: t.Optional[t.Dict[str, str]] = None
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class SimpleChromaConfig(BaseConnectorConfig):
    """Connection settings for a Chroma destination.

    Either ``path`` (local persistent client) or both ``host`` and ``port``
    (remote HTTP client) must be provided; see
    ``ChromaDestinationConnector.create_collection``.
    """

    access_config: ChromaAccessConfig
    # Target collection; created on first use if it does not exist.
    collection_name: str
    # Filesystem path for a local PersistentClient.
    path: t.Optional[str] = None
    tenant: t.Optional[str] = "default_tenant"
    database: t.Optional[str] = "default_database"
    # Host/port/ssl for a remote HttpClient.
    host: t.Optional[str] = None
    port: t.Optional[int] = None
    ssl: bool = False
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class ChromaWriteConfig(WriteConfig):
    # Number of elements upserted per request to Chroma.
    batch_size: int = 100
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class ChromaDestinationConnector(BaseDestinationConnector):
    """Writes normalized elements into a Chroma collection in batches.

    The chromadb client/collection is created lazily and cached in
    ``_collection``; it is dropped on serialization (see ``to_dict``).
    """

    write_config: ChromaWriteConfig
    connector_config: SimpleChromaConfig
    # Lazily-created chromadb Collection; never serialized.
    _collection: t.Optional["ChromaCollection"] = None

    @property
    def chroma_collection(self) -> "ChromaCollection":
        """Lazily create and cache the target Chroma collection."""
        if self._collection is None:
            self._collection = self.create_collection()
        return self._collection

    def initialize(self):
        pass

    @DestinationConnectionError.wrap
    def check_connection(self):
        # Touching the property forces client/collection creation, which
        # fails loudly if the connection settings are wrong.
        _ = self.chroma_collection

    def to_dict(self, **kwargs):
        """
        The _collection variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle 'module' object
        When serializing, remove it, meaning collection data will need to be reinitialized
        when deserialized
        """
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_collection"):
            setattr(self_cp, "_collection", None)
        return _asdict(self_cp, **kwargs)

    @requires_dependencies(["chromadb"], extras="chroma")
    def create_collection(self) -> "ChromaCollection":
        """Build a chromadb client from the connector config and return the
        target collection, creating it if necessary.

        Raises:
            ValueError: if neither ``path`` nor ``host``+``port`` is set.
        """
        import chromadb

        if self.connector_config.path:
            chroma_client = chromadb.PersistentClient(
                path=self.connector_config.path,
                # BUGFIX: `settings` lives on the access config, not on
                # SimpleChromaConfig (the HttpClient branch below already
                # reads it from access_config); the original referenced the
                # nonexistent `connector_config.settings`.
                settings=self.connector_config.access_config.settings,
                tenant=self.connector_config.tenant,
                database=self.connector_config.database,
            )

        elif self.connector_config.host and self.connector_config.port:
            chroma_client = chromadb.HttpClient(
                host=self.connector_config.host,
                port=self.connector_config.port,
                ssl=self.connector_config.ssl,
                headers=self.connector_config.access_config.headers,
                settings=self.connector_config.access_config.settings,
                tenant=self.connector_config.tenant,
                database=self.connector_config.database,
            )
        else:
            raise ValueError("Chroma connector requires either path or host and port to be set.")

        collection = chroma_client.get_or_create_collection(
            name=self.connector_config.collection_name
        )
        return collection

    @DestinationConnectionError.wrap
    @requires_dependencies(["chromadb"], extras="chroma")
    def upsert_batch(self, batch):
        """Upsert one pre-shaped batch (parallel id/document/embedding/metadata
        lists) into the collection.

        Raises:
            ValueError: wrapping any error raised by the Chroma client.
        """
        collection = self.chroma_collection

        try:
            # Chroma wants lists even if there is only one element
            # Upserting to prevent duplicates
            collection.upsert(
                ids=batch["ids"],
                documents=batch["documents"],
                embeddings=batch["embeddings"],
                metadatas=batch["metadatas"],
            )
        except Exception as e:
            raise ValueError(f"chroma error: {e}") from e

    @staticmethod
    def prepare_chroma_list(chunk: t.Tuple[t.Dict[str, t.Any]]) -> t.Dict[str, t.List[t.Any]]:
        """Helper function to break a tuple of dicts into list of parallel lists for ChromaDb.
        ({'id':1}, {'id':2}, {'id':3}) -> {'ids':[1,2,3]}"""
        chroma_dict = {
            "ids": [x.get("id") for x in chunk],
            "documents": [x.get("document") for x in chunk],
            "embeddings": [x.get("embedding") for x in chunk],
            "metadatas": [x.get("metadata") for x in chunk],
        }
        # Make sure all lists are of the same length.  An explicit raise is
        # used instead of `assert`, which is stripped under `python -O`.
        if len({len(v) for v in chroma_dict.values()}) != 1:
            raise ValueError("prepared Chroma lists are not all the same length")
        return chroma_dict

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Batch `elements_dict` per write_config.batch_size and upsert each batch."""
        logger.info(f"Inserting / updating {len(elements_dict)} documents to destination ")

        chroma_batch_size = self.write_config.batch_size

        for chunk in batch_generator(elements_dict, chroma_batch_size):
            self.upsert_batch(self.prepare_chroma_list(chunk))

    def normalize_dict(self, element_dict: dict) -> dict:
        """Reshape an unstructured element dict into Chroma's id/embedding/
        document/metadata layout; remaining keys are flattened into metadata."""
        element_id = element_dict.get("element_id", str(uuid.uuid4()))
        return {
            "id": element_id,
            "embedding": element_dict.pop("embeddings", None),
            "document": element_dict.pop("text", None),
            "metadata": flatten_dict(
                element_dict, separator="-", flatten_lists=True, remove_none=True
            ),
        }
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
import uuid
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
6
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
7
|
+
from unstructured_ingest.interfaces import (
|
|
8
|
+
AccessConfig,
|
|
9
|
+
BaseConnectorConfig,
|
|
10
|
+
BaseDestinationConnector,
|
|
11
|
+
WriteConfig,
|
|
12
|
+
)
|
|
13
|
+
from unstructured_ingest.logger import logger
|
|
14
|
+
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
15
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
16
|
+
|
|
17
|
+
if t.TYPE_CHECKING:
|
|
18
|
+
from clarifai.client.input import Inputs
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ClarifaiAccessConfig(AccessConfig):
    # Clarifai Personal Access Token (PAT); marked sensitive so it is
    # redacted when the config is serialized.
    api_key: str = enhanced_field(sensitive=True)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class SimpleClarifaiConfig(BaseConnectorConfig):
    """Identifies the Clarifai app (and optional dataset) to write into."""

    access_config: ClarifaiAccessConfig
    app_id: str
    user_id: str
    # Optional dataset within the app; passed through to get_text_input.
    dataset_id: t.Optional[str] = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class ClarifaiWriteConfig(WriteConfig):
    # Number of inputs uploaded per Clarifai upload_inputs call.
    batch_size: int = 50
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class ClarifaiDestinationConnector(BaseDestinationConnector):
    """Uploads text elements as Clarifai inputs, batching the uploads."""

    write_config: ClarifaiWriteConfig
    connector_config: SimpleClarifaiConfig
    # Lazily-created Clarifai Inputs client; see the `client` property.
    _client: t.Optional["Inputs"] = field(init=False, default=None)

    @property
    @requires_dependencies(["clarifai"], extras="clarifai")
    def client(self) -> "Inputs":
        """Lazily create and cache the Clarifai ``Inputs`` client.

        Raises:
            ValueError: if no PAT (``api_key``) was provided.
        """
        if self._client is None:
            from clarifai.client.input import Inputs

            access_conf = self.connector_config.access_config
            # BUGFIX: the original `raise (f"...")` raised a plain string
            # (itself a TypeError) and left the PAT unbound when api_key was
            # None; validate explicitly instead.
            if access_conf.api_key is None:
                raise ValueError("please provide clarifai PAT key")

            self._client = Inputs(
                app_id=self.connector_config.app_id,
                user_id=self.connector_config.user_id,
                pat=access_conf.api_key,
            )
        return self._client

    @requires_dependencies(["clarifai"], extras="clarifai")
    @DestinationConnectionError.wrap
    def initialize(self):
        # Force client creation so credential problems surface early.
        _ = self.client

    def check_connection(self):
        """Validate connectivity with the cheapest authenticated round trip."""
        try:
            _ = [inp for inp in self.client.list_inputs(page_no=1, per_page=1)]  # noqa: C416
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

    def normalize_dict(self, element_dict: dict) -> dict:
        """Modifying schema of the dict in order to compile with clarifai input formats"""
        return {
            "input_id": str(uuid.uuid4().hex),
            "text": element_dict.pop("text", None),
            "metadata": {
                **flatten_dict(
                    element_dict,
                    separator="_",
                    flatten_lists=True,
                    remove_none=True,
                ),
            },
        }

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upload `elements_dict` to the configured app in batches of
        write_config.batch_size."""
        from google.protobuf.struct_pb2 import Struct

        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"app {self.connector_config.app_id} "
        )
        batch_size = self.write_config.batch_size
        for idx in range(0, len(elements_dict), batch_size):
            batch_dict = elements_dict[idx : batch_size + idx]
            input_batch = []
            for elem in batch_dict:
                meta_struct = Struct()
                meta_struct.update(elem["metadata"])
                input_batch.append(
                    # BUGFIX: use the lazy `client` property — the original
                    # read `self._client` directly, which is still None if
                    # initialize() was never called.
                    self.client.get_text_input(
                        input_id=elem["input_id"],
                        raw_text=elem["text"],
                        dataset_id=self.connector_config.dataset_id,
                        metadata=meta_struct,
                    )
                )
            result_id = self.client.upload_inputs(inputs=input_batch)
            logger.debug(
                f"Input posted successfully into {self.connector_config.app_id}. \
                Result id: {result_id}"
            )
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import typing as t
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
10
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
11
|
+
from unstructured_ingest.interfaces import (
|
|
12
|
+
AccessConfig,
|
|
13
|
+
BaseConnectorConfig,
|
|
14
|
+
BaseSingleIngestDoc,
|
|
15
|
+
BaseSourceConnector,
|
|
16
|
+
IngestDocCleanupMixin,
|
|
17
|
+
SourceConnectorCleanupMixin,
|
|
18
|
+
SourceMetadata,
|
|
19
|
+
)
|
|
20
|
+
from unstructured_ingest.logger import logger
|
|
21
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
22
|
+
|
|
23
|
+
if t.TYPE_CHECKING:
|
|
24
|
+
from atlassian import Confluence
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class ConfluenceAccessConfig(AccessConfig):
    # API token used to authenticate against Confluence Cloud; marked
    # sensitive so the enhanced-dataclass machinery can redact it when the
    # config is serialized or logged.
    api_token: str = enhanced_field(sensitive=True)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class SimpleConfluenceConfig(BaseConnectorConfig):
    """Connector config where:
    user_email is the email to authenticate into Confluence Cloud,
    api_token is the api token to authenticate into Confluence Cloud,
    and url is the URL pointing to the Confluence Cloud instance.

    Check https://developer.atlassian.com/cloud/confluence/basic-auth-for-rest-apis/
    for more info on the api_token.
    """

    user_email: str
    access_config: ConfluenceAccessConfig
    url: str
    # Upper bound on how many spaces to enumerate from the instance.
    max_num_of_spaces: int = 500
    # Upper bound on how many documents to fetch from each space.
    max_num_of_docs_from_each_space: int = 100
    # Explicit space keys to fetch; when non-empty these take precedence over
    # enumerating spaces (see ConfluenceSourceConnector.initialize).
    spaces: t.List[str] = field(default_factory=list)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class ConfluenceDocumentMeta:
    """Metadata specifying:
    id for the confluence space that the document locates in,
    and the id of document that is being reached to.
    """

    # Key of the Confluence space containing the document.
    space_id: str
    # Id of the page/document within that space.
    document_id: str
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def scroll_wrapper(func):
    """Wrap a paginated Confluence API call so it transparently scrolls.

    The wrapped callable is invoked with ``start``/``limit`` keyword
    arguments, page by page, until ``number_of_items_to_fetch`` items have
    been collected. Works with endpoints that return either a plain list or
    a dict carrying a "results" key.
    """

    def wrapper(*args, **kwargs):
        number_of_items_to_fetch = kwargs.pop("number_of_items_to_fetch")
        if number_of_items_to_fetch <= 0:
            # Guard: with 0 requested items the page-count computation below
            # would divide by zero.
            return []

        # Confluence caps page size at 100 items per request.
        kwargs["limit"] = min(100, number_of_items_to_fetch)
        kwargs["start"] = kwargs.get("start", 0)

        all_results = []
        num_iterations = math.ceil(number_of_items_to_fetch / kwargs["limit"])

        for _ in range(num_iterations):
            # Issue the request exactly once per page. The original code
            # called func() a second time inside each isinstance branch,
            # doubling every API request.
            response = func(*args, **kwargs)
            if isinstance(response, list):
                all_results += response
            elif isinstance(response, dict):
                all_results += response["results"]

            kwargs["start"] += kwargs["limit"]

        # Trim any overshoot from the final page.
        return all_results[:number_of_items_to_fetch]

    return wrapper
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass
class ConfluenceIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating fetching a doc and writing processed results (but not
    doing the processing).

    Current implementation creates a Confluence connection object
    to fetch each doc, rather than creating one for each thread.
    """

    connector_config: SimpleConfluenceConfig
    document_meta: ConfluenceDocumentMeta
    registry_name: str = "confluence"

    # TODO: remove one of filename or _tmp_download_file, using a wrapper
    @property
    def filename(self):
        """Local path the raw HTML page downloads to:
        <download_dir>/<space_id>/<document_id>.html, or None when no
        download dir is configured."""
        if not self.read_config.download_dir:
            return None
        return (
            Path(self.read_config.download_dir)
            / self.document_meta.space_id
            / f"{self.document_meta.document_id}.html"
        ).resolve()

    @property
    def _output_filename(self):
        """Create output file path based on output directory, space id and document id."""
        output_file = f"{self.document_meta.document_id}.json"
        return Path(self.processor_config.output_dir) / self.document_meta.space_id / output_file

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        # Enough information to re-locate this page at the source.
        return {
            "url": self.connector_config.url,
            "page_id": self.document_meta.document_id,
        }

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_page(self):
        """Fetch this page (body view, history, version) from Confluence.

        Returns:
            The page dict, or None when the API reports an error for this id.
        """
        from atlassian import Confluence
        from atlassian.errors import ApiError

        try:
            confluence = Confluence(
                self.connector_config.url,
                username=self.connector_config.user_email,
                password=self.connector_config.access_config.api_token,
            )
            result = confluence.get_page_by_id(
                page_id=self.document_meta.document_id,
                expand="history.lastUpdated,version,body.view",
            )
        except ApiError as e:
            logger.error(e)
            return None
        return result

    def update_source_metadata(self, **kwargs):
        """Fetches file metadata from the current page.

        A caller that already holds the page may pass it as ``page=...`` to
        avoid a second network round-trip.
        """
        # BUGFIX: the original used kwargs.get("page", self._get_page()),
        # which evaluates the default eagerly — _get_page() (a network call)
        # ran even when the caller supplied the page.
        page = kwargs["page"] if "page" in kwargs else self._get_page()
        if page is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        document_history = page["history"]
        date_created = datetime.strptime(
            document_history["createdDate"],
            "%Y-%m-%dT%H:%M:%S.%fZ",
        ).isoformat()
        # Pages never edited carry no "lastUpdated" entry; fall back to the
        # creation timestamp in that case.
        if last_updated := document_history.get("lastUpdated", {}).get("when", ""):
            date_modified = datetime.strptime(
                last_updated,
                "%Y-%m-%dT%H:%M:%S.%fZ",
            ).isoformat()
        else:
            date_modified = date_created
        version = page["version"]["number"]
        self.source_metadata = SourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            version=version,
            source_url=page["_links"].get("self", None),
            exists=True,
        )

    @SourceConnectionError.wrap
    @requires_dependencies(["atlassian"], extras="confluence")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the page body (HTML view) to ``self.filename``.

        Raises:
            ValueError: when the page could not be retrieved.
        """
        # TODO: instead of having a separate connection object for each doc,
        # have a separate connection object for each process

        result = self._get_page()
        self.update_source_metadata(page=result)
        if result is None:
            raise ValueError(f"Failed to retrieve page with ID {self.document_meta.document_id}")
        self.document = result["body"]["view"]["value"]
        self.filename.parent.mkdir(parents=True, exist_ok=True)
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(self.document)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
@dataclass
class ConfluenceSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Fetches body fields from all documents within all spaces in a Confluence Cloud instance."""

    connector_config: SimpleConfluenceConfig
    # Lazily created API client; excluded from __init__ so the dataclass can
    # be constructed (and serialized) before any connection is made.
    _confluence: t.Optional["Confluence"] = field(init=False, default=None)

    @property
    def confluence(self) -> "Confluence":
        # Lazily construct and cache the Confluence client on first access.
        from atlassian import Confluence

        if self._confluence is None:
            self._confluence = Confluence(
                url=self.connector_config.url,
                username=self.connector_config.user_email,
                password=self.connector_config.access_config.api_token,
            )
        return self._confluence

    @requires_dependencies(["atlassian"], extras="Confluence")
    def check_connection(self):
        """Validate credentials with a cheap HEAD request against the space API."""
        url = "rest/api/space"
        try:
            self.confluence.request(method="HEAD", path=url)
        except requests.HTTPError as http_error:
            logger.error(f"failed to validate connection: {http_error}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {http_error}")

    @requires_dependencies(["atlassian"], extras="Confluence")
    def initialize(self):
        # Explicitly configured spaces take precedence over enumerating up to
        # max_num_of_spaces from the instance (see _get_doc_ids_within_spaces).
        self.list_of_spaces = None
        if self.connector_config.spaces:
            self.list_of_spaces = self.connector_config.spaces
            if self.connector_config.max_num_of_spaces:
                logger.warning(
                    """--confluence-list-of-spaces and --confluence-num-of-spaces cannot
                    be used at the same time. Connector will only fetch the
                    --confluence-list-of-spaces that you've provided.""",
                )

    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_space_ids(self):
        """Fetches spaces in a confluence domain."""

        get_spaces_with_scroll = scroll_wrapper(self.confluence.get_all_spaces)

        all_results = get_spaces_with_scroll(
            number_of_items_to_fetch=self.connector_config.max_num_of_spaces,
        )

        # A space's "key" is what the page-listing endpoint expects as its id.
        space_ids = [space["key"] for space in all_results]
        return space_ids

    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_docs_ids_within_one_space(
        self,
        space_id: str,
        content_type: str = "page",
    ):
        # Returns (space_id, doc_id) pairs so callers keep the association
        # between a document and its space.
        get_pages_with_scroll = scroll_wrapper(self.confluence.get_all_pages_from_space)
        results = get_pages_with_scroll(
            space=space_id,
            number_of_items_to_fetch=self.connector_config.max_num_of_docs_from_each_space,
            content_type=content_type,
        )

        doc_ids = [(space_id, doc["id"]) for doc in results]
        return doc_ids

    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_doc_ids_within_spaces(self):
        # Use the user-provided space list when set (by initialize);
        # otherwise enumerate every discoverable space.
        space_ids = self._get_space_ids() if not self.list_of_spaces else self.list_of_spaces

        doc_ids_all = [self._get_docs_ids_within_one_space(space_id=id) for id in space_ids]

        # Flatten list-of-lists into a single list of (space_id, doc_id) pairs.
        doc_ids_flattened = [
            (space_id, doc_id)
            for doc_ids_space in doc_ids_all
            for space_id, doc_id in doc_ids_space
        ]
        return doc_ids_flattened

    def get_ingest_docs(self):
        """Fetches all documents in a confluence space."""
        doc_ids = self._get_doc_ids_within_spaces()
        return [
            ConfluenceIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                document_meta=ConfluenceDocumentMeta(space_id, doc_id),
            )
            for space_id, doc_id in doc_ids
        ]
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import typing as t
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from io import BytesIO
|
|
7
|
+
from pathlib import PurePath
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
10
|
+
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
11
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
12
|
+
from unstructured_ingest.interfaces import (
|
|
13
|
+
AccessConfig,
|
|
14
|
+
BaseConnectorConfig,
|
|
15
|
+
BaseDestinationConnector,
|
|
16
|
+
BaseSingleIngestDoc,
|
|
17
|
+
WriteConfig,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.logger import logger
|
|
20
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
21
|
+
|
|
22
|
+
if t.TYPE_CHECKING:
|
|
23
|
+
from databricks.sdk import WorkspaceClient
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class DatabricksVolumesAccessConfig(AccessConfig):
    """Credential options for Databricks.

    Every field is optional: the full dict is forwarded verbatim as keyword
    arguments to ``databricks.sdk.WorkspaceClient`` (see
    ``DatabricksVolumesDestinationConnector.generate_client``), so callers
    set only the fields for the auth method they use. Secrets are marked
    sensitive so they are redacted on serialization.
    """

    account_id: t.Optional[str] = None
    # Basic (username/password) auth.
    username: t.Optional[str] = None
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # OAuth client-credentials auth.
    client_id: t.Optional[str] = None
    client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # Personal access token auth.
    token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # Named profile from a local Databricks config file.
    profile: t.Optional[str] = None
    # Azure-specific options.
    azure_workspace_resource_id: t.Optional[str] = None
    azure_client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    azure_client_id: t.Optional[str] = None
    azure_tenant_id: t.Optional[str] = None
    azure_environment: t.Optional[str] = None
    auth_type: t.Optional[str] = None
    cluster_id: t.Optional[str] = None
    # Google Cloud-specific options.
    google_credentials: t.Optional[str] = None
    google_service_account: t.Optional[str] = None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class SimpleDatabricksVolumesConfig(BaseConnectorConfig):
    # Credentials forwarded to the Databricks WorkspaceClient.
    access_config: DatabricksVolumesAccessConfig
    # Workspace URL; optional because the SDK can resolve it from the
    # profile/environment carried in access_config.
    host: t.Optional[str] = None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class DatabricksVolumesWriteConfig(WriteConfig):
    """Destination settings for uploads into a Unity Catalog volume."""

    volume: str
    catalog: str
    volume_path: t.Optional[str] = None
    overwrite: bool = False
    encoding: str = "utf-8"
    schema: str = "default"

    @property
    def path(self) -> str:
        """Absolute volume path: /Volumes/<catalog>/<schema>/<volume>[/<volume_path>]."""
        segments = ["", "Volumes", self.catalog, self.schema, self.volume]
        if self.volume_path:
            segments.append(self.volume_path)
        return "/".join(segments)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class DatabricksVolumesDestinationConnector(BaseDestinationConnector):
    """Uploads processed element JSON into a Databricks Unity Catalog volume."""

    write_config: DatabricksVolumesWriteConfig
    connector_config: SimpleDatabricksVolumesConfig
    # Lazily created SDK client; excluded from __init__ and blanked out
    # before serialization (see to_dict).
    _client: t.Optional["WorkspaceClient"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        # The WorkspaceClient is not serializable; null it on a shallow copy
        # so the original connector keeps its live client.
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_client"):
            setattr(self_cp, "_client", None)
        return _asdict(self_cp, **kwargs)

    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
    def generate_client(self) -> "WorkspaceClient":
        """Build a WorkspaceClient from the host plus all access-config fields."""
        from databricks.sdk import WorkspaceClient

        return WorkspaceClient(
            host=self.connector_config.host, **self.connector_config.access_config.to_dict()
        )

    @property
    def client(self) -> "WorkspaceClient":
        # Lazily construct and cache the SDK client.
        if self._client is None:
            self._client = self.generate_client()
        return self._client

    def check_connection(self):
        """Validate credentials by asserting the authenticated user is active.

        Raises:
            DestinationConnectionError: when authentication or the user
                lookup fails.
        """
        try:
            # NOTE(review): assert is stripped under `python -O`; an explicit
            # check-and-raise would be more robust.
            assert self.client.current_user.me().active
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

    def initialize(self):
        # Force client construction up front so auth errors surface early.
        _ = self.client

    def write_dict(
        self,
        *args,
        elements_dict: t.List[t.Dict[str, t.Any]],
        filename: t.Optional[str] = None,
        indent: int = 4,
        encoding: str = "utf-8",
        **kwargs,
    ) -> None:
        """Serialize ``elements_dict`` to JSON and upload it into the volume.

        NOTE(review): the ``indent`` and ``encoding`` parameters are accepted
        but ignored — the upload uses compact json.dumps and
        ``write_config.encoding``. Confirm whether any caller relies on them.
        """
        output_folder = self.write_config.path
        # NOTE(review): os.path.join with a single argument is a no-op; the
        # original comment claimed it appends a trailing separator, which it
        # does not.
        output_folder = os.path.join(output_folder)
        filename = (
            filename.strip(os.sep) if filename else filename
        )  # strip leading AND trailing separators so PurePath joins cleanly
        output_path = str(PurePath(output_folder, filename)) if filename else output_folder
        logger.debug(f"uploading content to {output_path}")
        self.client.files.upload(
            file_path=output_path,
            contents=BytesIO(json.dumps(elements_dict).encode(encoding=self.write_config.encoding)),
            overwrite=self.write_config.overwrite,
        )

    def get_elements_dict(self, docs: t.List[BaseSingleIngestDoc]) -> t.List[t.Dict[str, t.Any]]:
        # Intentionally unimplemented: write() below streams each doc's JSON
        # file directly instead of aggregating elements first.
        pass

    def write(self, docs: t.List[BaseSingleIngestDoc]) -> None:
        """Upload each processed doc's JSON output file into the volume."""
        for doc in docs:
            file_path = doc.base_output_filename
            filename = file_path if file_path else None
            with open(doc._output_filename) as json_file:
                logger.debug(f"uploading content from {doc._output_filename}")
                json_list = json.load(json_file)
                self.write_dict(elements_dict=json_list, filename=filename)
|