unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,573 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import typing as t
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from html import unescape
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
10
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
11
|
+
from unstructured_ingest.interfaces import (
|
|
12
|
+
AccessConfig,
|
|
13
|
+
BaseConnectorConfig,
|
|
14
|
+
BaseSingleIngestDoc,
|
|
15
|
+
BaseSourceConnector,
|
|
16
|
+
IngestDocCleanupMixin,
|
|
17
|
+
SourceConnectorCleanupMixin,
|
|
18
|
+
SourceMetadata,
|
|
19
|
+
)
|
|
20
|
+
from unstructured_ingest.interfaces import PermissionsConfig as SharepointPermissionsConfig
|
|
21
|
+
from unstructured_ingest.logger import logger
|
|
22
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
23
|
+
from unstructured_ingest.utils.string_and_date_utils import ensure_isoformat_datetime
|
|
24
|
+
|
|
25
|
+
if t.TYPE_CHECKING:
|
|
26
|
+
from office365.sharepoint.client_context import ClientContext
|
|
27
|
+
from office365.sharepoint.files.file import File
|
|
28
|
+
from office365.sharepoint.publishing.pages.page import SitePage
|
|
29
|
+
|
|
30
|
+
# Size threshold compared against ``file.length`` in ``_download_file``; larger
# files are fetched through a chunked download session. Despite the "MB" in the
# name, the compared size is logged there as bytes.
MAX_MB_SIZE = 512 * 1000 * 1000
|
|
31
|
+
# List-item fields selected when a site page is fetched in ``_fetch_file``.
CONTENT_LABELS = [
    "CanvasContent1",
    "LayoutWebpartsContent1",
    "TimeCreated",
]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class SharepointAccessConfig(AccessConfig):
    """Credential portion of the Sharepoint connector configuration."""

    # Client credential handed to ``ClientCredential`` together with the
    # connector's ``client_id``; flagged sensitive and excluded from ``repr``.
    client_cred: str = enhanced_field(sensitive=True, repr=False)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class SimpleSharepointConfig(BaseConnectorConfig):
    """Connection settings for a Sharepoint site, plus client factories.

    ``process_pages`` is not an ``__init__`` argument: it is derived in
    ``__post_init__`` as the inverse of ``files_only``.
    """

    access_config: SharepointAccessConfig
    client_id: str
    site: str
    path: str
    process_pages: bool = enhanced_field(default=True, init=False)
    recursive: bool = False
    files_only: bool = False
    permissions_config: t.Optional[SharepointPermissionsConfig] = None

    def __post_init__(self):
        """Validate the mandatory values and derive ``process_pages``.

        Raises:
            ValueError: if ``client_id``, the client credential or ``site``
                is empty.
        """
        if not (self.client_id and self.access_config.client_cred and self.site):
            raise ValueError(
                "Please provide one of the following mandatory values:"
                "\n--client-id\n--client-cred\n--site",
            )
        self.process_pages = not self.files_only

    @requires_dependencies(["office365"], extras="sharepoint")
    def get_site_client(self, site_url: str = "") -> "ClientContext":
        """Build a ``ClientContext`` authenticated with the client credentials.

        Args:
            site_url: Site to connect to; when empty, ``self.site`` is used.

        Returns:
            The client context with credentials attached.

        Raises:
            Exception: whatever the office365 client raises while being
                built; it is logged and re-raised unchanged.
        """
        from office365.runtime.auth.client_credential import ClientCredential
        from office365.sharepoint.client_context import ClientContext

        try:
            site_client = ClientContext(site_url or self.site).with_credentials(
                ClientCredential(self.client_id, self.access_config.client_cred),
            )
        except Exception:
            logger.error("Couldn't set Sharepoint client.")
            raise
        return site_client

    def get_permissions_client(self):
        """Create a permissions connector and check it holds an access token.

        Returns:
            The ``SharepointPermissionsConnector`` on success, or ``None``
            when it could not be created or has no access token (the failure
            is logged rather than raised).
        """
        try:
            permissions_connector = SharepointPermissionsConnector(self.permissions_config)
            # Explicit check instead of ``assert`` so the validation is not
            # stripped when Python runs with ``-O``.
            if not permissions_connector.access_token:
                raise ValueError("permissions connector returned an empty access token")
            return permissions_connector
        except Exception as e:
            # The error is interpolated into the message; passing it as a
            # stray positional argument without a placeholder loses it.
            logger.error(f"Couldn't obtain Sharepoint permissions ingestion access token: {e}")
        return None
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass
|
|
82
|
+
class SharepointIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
|
|
83
|
+
connector_config: SimpleSharepointConfig
|
|
84
|
+
site_url: str
|
|
85
|
+
server_path: str
|
|
86
|
+
is_page: bool
|
|
87
|
+
file_path: str
|
|
88
|
+
registry_name: str = "sharepoint"
|
|
89
|
+
|
|
90
|
+
def __post_init__(self):
|
|
91
|
+
self.extension = Path(self.file_path).suffix if not self.is_page else ".html"
|
|
92
|
+
self.extension = ".html" if self.extension == ".aspx" else self.extension
|
|
93
|
+
if not self.extension:
|
|
94
|
+
raise ValueError("Unsupported file without extension.")
|
|
95
|
+
|
|
96
|
+
self._set_download_paths()
|
|
97
|
+
|
|
98
|
+
def _set_download_paths(self) -> None:
|
|
99
|
+
"""Parses the folder structure from the source and creates the download and output paths"""
|
|
100
|
+
download_path = Path(f"{self.read_config.download_dir}")
|
|
101
|
+
output_path = Path(f"{self.processor_config.output_dir}")
|
|
102
|
+
parent = Path(self.file_path).with_suffix(self.extension)
|
|
103
|
+
self.download_dir = (download_path / parent.parent).resolve()
|
|
104
|
+
self.download_filepath = (download_path / parent).resolve()
|
|
105
|
+
output_filename = str(parent) + ".json"
|
|
106
|
+
self.output_dir = (output_path / parent.parent).resolve()
|
|
107
|
+
self.output_filepath = (output_path / output_filename).resolve()
|
|
108
|
+
|
|
109
|
+
@property
|
|
110
|
+
def filename(self):
|
|
111
|
+
return Path(self.download_filepath).resolve()
|
|
112
|
+
|
|
113
|
+
@property
|
|
114
|
+
def _output_filename(self):
|
|
115
|
+
return Path(self.output_filepath).resolve()
|
|
116
|
+
|
|
117
|
+
@property
|
|
118
|
+
def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
|
|
119
|
+
return {
|
|
120
|
+
"server_path": self.server_path,
|
|
121
|
+
"site_url": self.site_url,
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
@SourceConnectionNetworkError.wrap
|
|
125
|
+
@requires_dependencies(["office365"], extras="sharepoint")
|
|
126
|
+
def _fetch_file(self, properties_only: bool = False):
|
|
127
|
+
"""Retrieves the actual page/file from the Sharepoint instance"""
|
|
128
|
+
from office365.runtime.client_request_exception import ClientRequestException
|
|
129
|
+
|
|
130
|
+
site_client = self.connector_config.get_site_client(self.site_url)
|
|
131
|
+
|
|
132
|
+
try:
|
|
133
|
+
if self.is_page:
|
|
134
|
+
file = site_client.web.get_file_by_server_relative_path("/" + self.server_path)
|
|
135
|
+
file = file.listItemAllFields.select(CONTENT_LABELS).get().execute_query()
|
|
136
|
+
else:
|
|
137
|
+
file = site_client.web.get_file_by_server_relative_url(self.server_path)
|
|
138
|
+
if properties_only:
|
|
139
|
+
file = file.get().execute_query()
|
|
140
|
+
except ClientRequestException as e:
|
|
141
|
+
if e.response.status_code == 404:
|
|
142
|
+
return None
|
|
143
|
+
raise
|
|
144
|
+
return file
|
|
145
|
+
|
|
146
|
+
def _fetch_page(self):
|
|
147
|
+
site_client = self.connector_config.get_site_client(self.site_url)
|
|
148
|
+
try:
|
|
149
|
+
page = (
|
|
150
|
+
site_client.site_pages.pages.get_by_url(self.server_path)
|
|
151
|
+
.expand(["FirstPublished", "Modified", "Version"])
|
|
152
|
+
.get()
|
|
153
|
+
.execute_query()
|
|
154
|
+
)
|
|
155
|
+
except Exception as e:
|
|
156
|
+
logger.error(f"Failed to retrieve page {self.server_path} from site {self.site_url}")
|
|
157
|
+
logger.error(e)
|
|
158
|
+
return None
|
|
159
|
+
return page
|
|
160
|
+
|
|
161
|
+
def update_permissions_data(self):
|
|
162
|
+
def parent_name_matches(parent_type, permissions_filename, ingest_doc_filepath):
|
|
163
|
+
permissions_filename = permissions_filename.split("_SEP_")
|
|
164
|
+
ingest_doc_filepath = ingest_doc_filepath.split("/")
|
|
165
|
+
|
|
166
|
+
if parent_type == "sites":
|
|
167
|
+
return permissions_filename[0] == ingest_doc_filepath[1]
|
|
168
|
+
|
|
169
|
+
elif parent_type == "SitePages" or parent_type == "Shared Documents":
|
|
170
|
+
return True
|
|
171
|
+
|
|
172
|
+
permissions_data = None
|
|
173
|
+
permissions_dir = Path(self.processor_config.output_dir) / "permissions_data"
|
|
174
|
+
|
|
175
|
+
if permissions_dir.is_dir():
|
|
176
|
+
parent_type = self.file_path.split("/")[0]
|
|
177
|
+
|
|
178
|
+
if parent_type == "sites":
|
|
179
|
+
read_dir = permissions_dir / "sites"
|
|
180
|
+
elif parent_type == "SitePages" or parent_type == "Shared Documents":
|
|
181
|
+
read_dir = permissions_dir / "other"
|
|
182
|
+
else:
|
|
183
|
+
read_dir = permissions_dir / "other"
|
|
184
|
+
|
|
185
|
+
for filename in os.listdir(read_dir):
|
|
186
|
+
permissions_docname = os.path.splitext(filename)[0].split("_SEP_")[1]
|
|
187
|
+
ingestdoc_docname = self.file_path.split("/")[-1]
|
|
188
|
+
|
|
189
|
+
if ingestdoc_docname == permissions_docname and parent_name_matches(
|
|
190
|
+
parent_type=parent_type,
|
|
191
|
+
permissions_filename=filename,
|
|
192
|
+
ingest_doc_filepath=self.file_path,
|
|
193
|
+
):
|
|
194
|
+
with open(read_dir / filename) as f:
|
|
195
|
+
permissions_data = json.loads(f.read())
|
|
196
|
+
|
|
197
|
+
return permissions_data
|
|
198
|
+
|
|
199
|
+
def update_source_metadata(self, **kwargs):
|
|
200
|
+
if self.is_page:
|
|
201
|
+
page = self._fetch_page()
|
|
202
|
+
if page is None:
|
|
203
|
+
self.source_metadata = SourceMetadata(
|
|
204
|
+
exists=False,
|
|
205
|
+
)
|
|
206
|
+
return
|
|
207
|
+
self.source_metadata = SourceMetadata(
|
|
208
|
+
date_created=page.get_property("FirstPublished", None),
|
|
209
|
+
date_modified=page.get_property("Modified", None),
|
|
210
|
+
version=page.get_property("Version", ""),
|
|
211
|
+
source_url=page.absolute_url,
|
|
212
|
+
exists=True,
|
|
213
|
+
permissions_data=(
|
|
214
|
+
self.update_permissions_data()
|
|
215
|
+
if self.connector_config.permissions_config
|
|
216
|
+
else None
|
|
217
|
+
),
|
|
218
|
+
)
|
|
219
|
+
return
|
|
220
|
+
|
|
221
|
+
file = self._fetch_file(True)
|
|
222
|
+
if file is None:
|
|
223
|
+
self.source_metadata = SourceMetadata(
|
|
224
|
+
exists=False,
|
|
225
|
+
)
|
|
226
|
+
return
|
|
227
|
+
self.source_metadata = SourceMetadata(
|
|
228
|
+
date_created=ensure_isoformat_datetime(timestamp=file.time_created),
|
|
229
|
+
date_modified=ensure_isoformat_datetime(timestamp=file.time_last_modified),
|
|
230
|
+
version=file.major_version,
|
|
231
|
+
source_url=file.properties.get("LinkingUrl", None),
|
|
232
|
+
exists=True,
|
|
233
|
+
permissions_data=(
|
|
234
|
+
self.update_permissions_data() if self.connector_config.permissions_config else None
|
|
235
|
+
),
|
|
236
|
+
)
|
|
237
|
+
|
|
238
|
+
def _download_page(self):
|
|
239
|
+
"""Formats and saves locally page content"""
|
|
240
|
+
content = self._fetch_file()
|
|
241
|
+
self.update_source_metadata()
|
|
242
|
+
pld = (content.properties.get("LayoutWebpartsContent1", "") or "") + (
|
|
243
|
+
content.properties.get("CanvasContent1", "") or ""
|
|
244
|
+
)
|
|
245
|
+
if pld != "":
|
|
246
|
+
pld = unescape(pld)
|
|
247
|
+
else:
|
|
248
|
+
logger.info(
|
|
249
|
+
f"Page {self.server_path} has no retrievable content. \
|
|
250
|
+
Dumping empty doc.",
|
|
251
|
+
)
|
|
252
|
+
pld = "<div></div>"
|
|
253
|
+
|
|
254
|
+
self.output_dir.mkdir(parents=True, exist_ok=True)
|
|
255
|
+
if not self.download_dir.is_dir():
|
|
256
|
+
logger.debug(f"Creating directory: {self.download_dir}")
|
|
257
|
+
self.download_dir.mkdir(parents=True, exist_ok=True)
|
|
258
|
+
with self.filename.open(mode="w") as f:
|
|
259
|
+
f.write(pld)
|
|
260
|
+
logger.info(f"File downloaded: {self.filename}")
|
|
261
|
+
|
|
262
|
+
def _download_file(self):
    """Download the remote file into ``self.filename``.

    A file whose ``length`` exceeds ``MAX_MB_SIZE`` goes through
    ``download_session`` with a ``chunk_size`` of ``1024 * 1024 * 100``;
    anything else uses a plain ``download`` call.
    """
    remote = self._fetch_file()
    self.update_source_metadata()
    size = remote.length
    self.output_dir.mkdir(parents=True, exist_ok=True)

    if not self.download_dir.is_dir():
        logger.debug(f"Creating directory: {self.download_dir}")
        self.download_dir.mkdir(parents=True, exist_ok=True)

    chunked = size > MAX_MB_SIZE
    if chunked:
        logger.info(f"Downloading file with size: {size} bytes in chunks")
    with self.filename.open(mode="wb") as sink:
        if chunked:
            request = remote.download_session(sink, chunk_size=1024 * 1024 * 100)
        else:
            request = remote.download(sink)
        request.execute_query()
    logger.info(f"File downloaded: {self.filename}")
|
|
280
|
+
|
|
281
|
+
@BaseSingleIngestDoc.skip_if_file_exists
@SourceConnectionError.wrap
@requires_dependencies(["office365"], extras="sharepoint")
def get_file(self):
    """Download this document: page content when ``self.is_page``, the file otherwise.

    ``extras="sharepoint"`` matches the other ``office365`` dependency
    declarations in this module.
    """
    if self.is_page:
        self._download_page()
    else:
        self._download_file()
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
@dataclass
class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that lists SharePoint files, and optionally site pages, as ingest docs."""

    connector_config: SimpleSharepointConfig

    def check_connection(self):
        """Verify the connection by querying the configured site's pages.

        Raises:
            SourceConnectionError: if building the client or running the query
                fails; the triggering exception is chained as ``__cause__``.
        """
        try:
            site_client = self.connector_config.get_site_client()
            site_client.site_pages.pages.get().execute_query()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["office365"], extras="sharepoint")
    def _list_files(self, folder, recursive) -> t.List["File"]:
        """Return the files in ``folder``, descending into sub-folders when ``recursive``.

        Sub-folders whose server-relative URL contains ``/Forms`` are skipped.
        A ``ClientRequestException`` produces an empty list; it is logged
        unless the response status is 404.
        """
        from office365.runtime.client_request_exception import ClientRequestException

        try:
            objects = folder.expand(["Files", "Folders"]).get().execute_query()
            files = list(objects.files)
            if not recursive:
                return files
            for f in objects.folders:
                if "/Forms" in f.serverRelativeUrl:
                    continue
                files += self._list_files(f, recursive)
            return files
        except ClientRequestException as e:
            if e.response.status_code != 404:
                logger.info("Caught an error while processing documents %s", e.response.text)
            return []

    def _prepare_ingest_doc(self, obj: t.Union["File", "SitePage"], base_url, is_page=False):
        """Build a ``SharepointIngestDoc`` for a file or a site page.

        For a page, the ``Url`` property supplies the path; ``file_path`` is
        additionally prefixed with the path component of ``base_url`` when that
        component is non-empty and not ``/``. For a file, both paths come from
        ``obj.serverRelativeUrl``.
        """
        if is_page:
            # "Url" falls back to "", so test the prefix with startswith():
            # indexing [0] on an empty string would raise IndexError.
            file_path = obj.get_property("Url", "") or ""
            server_path = file_path[1:] if file_path.startswith("/") else file_path
            if (url_path := (urlparse(base_url).path)) and (url_path != "/"):
                file_path = url_path[1:] + "/" + file_path
        else:
            server_path = obj.serverRelativeUrl
            file_path = obj.serverRelativeUrl[1:]

        return SharepointIngestDoc(
            processor_config=self.processor_config,
            read_config=self.read_config,
            connector_config=self.connector_config,
            site_url=base_url,
            server_path=server_path,
            is_page=is_page,
            file_path=file_path,
        )

    @requires_dependencies(["office365"], extras="sharepoint")
    def _list_pages(self, site_client) -> list:
        """Return an ingest doc for every site page, or ``[]`` if the page query fails."""
        from office365.runtime.client_request_exception import ClientRequestException

        try:
            site_pages = site_client.site_pages.pages.get().execute_query()
        except ClientRequestException as e:
            logger.info(
                "Caught an error while retrieving site pages from %s \n%s",
                site_client.base_url,
                e.response.text,
            )
            return []

        return [self._prepare_ingest_doc(page, site_client.base_url, True) for page in site_pages]

    def _ingest_site_docs(self, site_client) -> t.List["SharepointIngestDoc"]:
        """Collect ingest docs for one site: files under the configured path, then pages.

        Pages are only listed when ``connector_config.process_pages`` is set.
        A file whose doc cannot be prepared (``ValueError``) is logged and skipped.
        """
        root_folder = site_client.web.get_folder_by_server_relative_path(self.connector_config.path)
        files = self._list_files(root_folder, self.connector_config.recursive)
        if not files:
            # Two adjacent literals instead of a backslash continuation, which
            # would splice the next line's indentation into the message.
            logger.info(
                f"No processable files at path {self.connector_config.path} "
                f"for site {site_client.base_url}",
            )
        output = []
        for file in files:
            try:
                output.append(self._prepare_ingest_doc(file, site_client.base_url))
            except ValueError as e:
                logger.error("Unable to process file %s", file.properties["Name"])
                logger.error(e)
        if self.connector_config.process_pages:
            page_output = self._list_pages(site_client)
            if not page_output:
                logger.info(f"Couldn't process pages for site {site_client.base_url}")
            output = output + page_output
        return output

    def initialize(self):
        """No setup is performed."""
        pass

    def get_ingest_docs(self):
        """Return ingest docs for the configured site, or for every tenant site.

        When the permissions config provides ``application_id``, ``client_cred``
        and ``tenant``, all permissions are first written under the processor's
        output directory. If the base client is a tenant client, every site URL
        reported by the tenant is processed; otherwise only the base site is.
        """
        base_site_client = self.connector_config.get_site_client()

        if not all(
            getattr(self.connector_config.permissions_config, attr, False)
            for attr in ["application_id", "client_cred", "tenant"]
        ):
            # Trailing space added: the two literals are concatenated verbatim.
            logger.info(
                "Permissions config is not fed with 'application_id', 'client_cred' and 'tenant'. "
                "Skipping permissions ingestion.",
            )
        else:
            permissions_client = self.connector_config.get_permissions_client()
            if permissions_client:
                permissions_client.write_all_permissions(self.processor_config.output_dir)

        if not base_site_client.is_tenant:
            return self._ingest_site_docs(base_site_client)
        tenant = base_site_client.tenant
        tenant_sites = tenant.get_site_properties_from_sharepoint_by_filters().execute_query()
        # A set removes duplicate URLs; sites without a URL are ignored.
        tenant_sites = {s.url for s in tenant_sites if (s.url is not None)}
        ingest_docs: t.List[SharepointIngestDoc] = []
        for site_url in tenant_sites:
            logger.info(f"Processing docs for site: {site_url}")
            site_client = self.connector_config.get_site_client(site_url)
            # extend() grows the list in place rather than copying it per site.
            ingest_docs.extend(self._ingest_site_docs(site_client))
        return ingest_docs
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
@dataclass
|
|
414
|
+
class SharepointPermissionsConnector:
|
|
415
|
+
def __init__(self, permissions_config):
|
|
416
|
+
self.permissions_config: SharepointPermissionsConfig = permissions_config
|
|
417
|
+
self.initialize()
|
|
418
|
+
|
|
419
|
+
def initialize(self):
    """Obtain an access token and keep it on ``self.access_token``."""
    token = self.get_access_token()
    self.access_token: str = token
|
|
421
|
+
|
|
422
|
+
@requires_dependencies(["requests"], extras="sharepoint")
def get_access_token(self) -> str:
    """Request an access token with the client-credentials grant.

    Posts the application id and client credential from
    ``self.permissions_config`` to the tenant's ``oauth2/v2.0/token`` endpoint
    with the ``https://graph.microsoft.com/.default`` scope.

    Returns:
        The ``access_token`` value of the JSON response.

    Raises:
        KeyError: if the response has no ``access_token`` entry. The status
            code and the response's error fields are logged before re-raising.
    """
    import requests

    url = (
        f"https://login.microsoftonline.com/{self.permissions_config.tenant}/oauth2/v2.0/token"
    )
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {
        "client_id": self.permissions_config.application_id,
        "scope": "https://graph.microsoft.com/.default",
        "client_secret": self.permissions_config.client_cred,
        "grant_type": "client_credentials",
    }
    response = requests.post(url, headers=headers, data=data)
    payload = response.json()
    try:
        return payload["access_token"]
    except KeyError:
        # Without this the failure surfaces only as KeyError('access_token').
        # Only the error fields are logged, never the request data.
        logger.error(
            "Failed to obtain access token (status %s): %s %s",
            response.status_code,
            payload.get("error", ""),
            payload.get("error_description", ""),
        )
        raise
|
|
438
|
+
|
|
439
|
+
def validated_response(self, response):
    """Return the decoded JSON of a 200 response; log any other response and return None."""
    if response.status_code != 200:
        logger.info(f"Request failed with status code {response.status_code}:")
        logger.info(response.text)
        return None
    return response.json()
|
|
445
|
+
|
|
446
|
+
@requires_dependencies(["requests"], extras="sharepoint")
def get_sites(self):
    """Query the Graph ``/sites`` endpoint, selecting ``webUrl`` and ``id``.

    Returns whatever ``validated_response`` produces for the reply.
    """
    import requests

    bearer = {"Authorization": f"Bearer {self.access_token}"}
    query = {"$select": "webUrl, id"}
    reply = requests.get(
        "https://graph.microsoft.com/v1.0/sites",
        params=query,
        headers=bearer,
    )
    return self.validated_response(reply)
|
|
461
|
+
|
|
462
|
+
@requires_dependencies(["requests"], extras="sharepoint")
def get_drives(self, site):
    """Query the Graph ``/sites/{site}/drives`` endpoint.

    Returns whatever ``validated_response`` produces for the reply.
    """
    import requests

    endpoint = f"https://graph.microsoft.com/v1.0/sites/{site}/drives"
    bearer = {"Authorization": f"Bearer {self.access_token}"}
    reply = requests.get(endpoint, headers=bearer)
    return self.validated_response(reply)
|
|
475
|
+
|
|
476
|
+
@requires_dependencies(["requests"], extras="sharepoint")
def get_drive_items(self, site, drive_id):
    """Query the children of a drive's root folder.

    Returns whatever ``validated_response`` produces for the reply.
    """
    import requests

    endpoint = f"https://graph.microsoft.com/v1.0/sites/{site}/drives/{drive_id}/root/children"
    bearer = {"Authorization": f"Bearer {self.access_token}"}
    reply = requests.get(endpoint, headers=bearer)
    return self.validated_response(reply)
|
|
489
|
+
|
|
490
|
+
def extract_site_name_from_weburl(self, weburl):
    """Classify a document URL by the first segment of its path.

    Returns:
        A ``(parent_type, parent_name)`` tuple:

        * ``("sites", <name>)`` for ``/sites/<name>/...``
        * ``("Shared Documents", "Shared Documents")`` for ``/Shared%20Documents/...``
        * ``("Personal", "Personal")`` for ``/personal/...``
        * ``("layouts", "layouts")`` for ``/_layouts/...``
        * ``(None, None)`` for any other structure, after logging a warning
    """
    segments = urlparse(weburl).path.lstrip("/").split("/")
    root = segments[0]

    # A bare "/sites" path has no site name to return; let it fall through to
    # the unknown-structure warning rather than raising IndexError.
    if root == "sites" and len(segments) > 1:
        return "sites", segments[1]

    fixed_parents = {
        "Shared%20Documents": "Shared Documents",
        "personal": "Personal",
        "_layouts": "layouts",
    }
    if root in fixed_parents:
        label = fixed_parents[root]
        return label, label

    # if other weburl structures are found, additional logic might need to be implemented

    # The message needs a %s placeholder for the URL argument; without one the
    # logging module reports a formatting error instead of emitting the record.
    logger.warning(
        "Couldn't extract sitename, unknown site or parent type. Skipping permissions "
        "ingestion for the document with the URL: %s",
        weburl,
    )

    return None, None
|
|
514
|
+
|
|
515
|
+
@requires_dependencies(["requests"], extras="sharepoint")
def get_permissions_for_drive_item(self, site, drive_id, item_id):
    """Query the permissions of one drive item.

    Returns whatever ``validated_response`` produces for the reply.
    """
    import requests

    # Built from adjacent literals: a backslash continuation inside the string
    # would insert a space and the next line's indentation after "sites/".
    url = (
        f"https://graph.microsoft.com/v1.0/sites/{site}"
        f"/drives/{drive_id}/items/{item_id}/permissions"
    )

    headers = {
        "Authorization": f"Bearer {self.access_token}",
    }

    response = requests.get(url, headers=headers)

    return self.validated_response(response)
|
|
529
|
+
|
|
530
|
+
def write_all_permissions(self, output_dir):
    """Fetch permissions for drive items and write them to disk as JSON.

    Walks every site from ``get_sites``, every drive of each site and every
    child of each drive's root, then writes each item's permissions to
    ``<output_dir>/permissions_data/<sites|other>/<parent_name>_SEP_<item_name>.json``.
    Lookups for which the request helpers return nothing are skipped, as are
    items whose parent cannot be determined from their URL.
    """
    sites_response = self.get_sites()
    if not sites_response:
        # get_sites() returns None when the request is rejected; guard it the
        # same way the drive and item lookups below are guarded.
        logger.warning("Could not list sites; skipping permissions ingestion")
        return
    site_ids = [site["id"] for site in sites_response["value"]]

    logger.info("Obtaining drive data for sites for permissions (rbac)")
    drive_ids = []
    for site_id in site_ids:
        drives = self.get_drives(site_id)
        if drives:
            drive_ids.extend([(site_id, drive["id"]) for drive in drives["value"]])

    logger.info("Obtaining item data from drives for permissions (rbac)")
    item_ids = []
    for site, drive_id in drive_ids:
        drive_items = self.get_drive_items(site, drive_id)
        if drive_items:
            item_ids.extend(
                [
                    (site, drive_id, item["id"], item["name"], item["webUrl"])
                    for item in drive_items["value"]
                ],
            )

    permissions_dir = Path(output_dir) / "permissions_data"

    logger.info("Writing permissions data to disk")
    for site, drive_id, item_id, item_name, item_web_url in item_ids:
        res = self.get_permissions_for_drive_item(site, drive_id, item_id)
        if not res:
            continue

        parent_type, parent_name = self.extract_site_name_from_weburl(item_web_url)
        if parent_type is None:
            # extract_site_name_from_weburl has already warned that this
            # document is skipped; do not write a "None_SEP_..." file for it.
            continue

        # Only "sites" parents get their own directory; every other known
        # parent type is stored under "other".
        subdir = "sites" if parent_type == "sites" else "other"
        write_path = permissions_dir / subdir / f"{parent_name}_SEP_{item_name}.json"

        # exist_ok avoids the check-then-create race of is_dir() + makedirs().
        write_path.parent.mkdir(parents=True, exist_ok=True)

        with open(write_path, "w") as f:
            json.dump(res["value"], f)
|