unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from time import time
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
7
|
+
from urllib.parse import quote
|
|
8
|
+
|
|
9
|
+
from unstructured.documents.elements import DataSourceMetadata
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
|
|
12
|
+
from unstructured_ingest.error import SourceConnectionNetworkError
|
|
13
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
+
from unstructured_ingest.v2.interfaces import (
|
|
15
|
+
AccessConfig,
|
|
16
|
+
ConnectionConfig,
|
|
17
|
+
Downloader,
|
|
18
|
+
DownloaderConfig,
|
|
19
|
+
DownloadResponse,
|
|
20
|
+
FileData,
|
|
21
|
+
Indexer,
|
|
22
|
+
IndexerConfig,
|
|
23
|
+
SourceIdentifiers,
|
|
24
|
+
download_responses,
|
|
25
|
+
)
|
|
26
|
+
from unstructured_ingest.v2.logger import logger
|
|
27
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
28
|
+
SourceRegistryEntry,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
from .utils import parse_datetime
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from office365.graph_client import GraphClient
|
|
35
|
+
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
36
|
+
from office365.onedrive.drives.drive import Drive
|
|
37
|
+
from office365.onedrive.permissions.permission import Permission
|
|
38
|
+
from office365.onedrive.sites.site import Site
|
|
39
|
+
from office365.sharepoint.client_context import ClientContext
|
|
40
|
+
from office365.sharepoint.files.file import File
|
|
41
|
+
from office365.sharepoint.folders.folder import Folder
|
|
42
|
+
from office365.sharepoint.publishing.pages.page import SitePage
|
|
43
|
+
|
|
44
|
+
# Identifier passed as `connector_type` on every FileData record built in this module.
CONNECTOR_TYPE = "sharepoint"
|
|
45
|
+
|
|
46
|
+
# NOTE(review): despite the "MB" in the name, the literal looks like a byte count
# (512_000_000); no use of it is visible in this part of the file -- confirm the unit.
MAX_MB_SIZE = 512_000_000
|
|
47
|
+
|
|
48
|
+
# TODO handle other data types possible from Sharepoint
|
|
49
|
+
# examples: https://github.com/vgrem/Office365-REST-Python-Client/tree/master/examples/sharepoint
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class SharepointContentType(Enum):
    """Kinds of SharePoint content handled by this connector.

    The string value of a member is what gets stored under the
    ``sharepoint_content_type`` key of a record's additional metadata.
    """

    # Files that live in a document library.
    DOCUMENT = "document"
    # Site pages.
    SITEPAGE = "site_page"
    # Lists.
    LIST = "list"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class SharepointAccessConfig(AccessConfig):
    """Secret part of the SharePoint connection settings."""

    # Client credential that is paired with the connection's client_id when the
    # SharePoint ClientCredential is built.
    client_cred: str
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class SharepointPermissionsConfig(EnhancedDataClassJsonMixin):
    """Settings used to acquire a Microsoft Graph token for permissions lookups."""

    # Client id handed to msal's ConfidentialClientApplication.
    permissions_application_id: str
    # Tenant segment appended to authority_url to form the token authority.
    permissions_tenant: str
    # Client credential for that application; marked as sensitive.
    permissions_client_cred: str = enhanced_field(sensitive=True)
    # Base login URL the tenant is appended to; left out of the dataclass repr.
    authority_url: Optional[str] = field(default="https://login.microsoftonline.com", repr=False)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
class SharepointConnectionConfig(ConnectionConfig):
    """Connection settings for one SharePoint site plus factories for its API clients.

    ``client_id`` and ``access_config.client_cred`` authenticate the SharePoint
    client for ``site``. ``permissions_config`` is optional; when present it is
    used to obtain a Microsoft Graph token for permissions lookups.
    """

    client_id: str
    site: str
    access_config: SharepointAccessConfig = enhanced_field(sensitive=True)
    permissions_config: Optional[SharepointPermissionsConfig] = None

    @requires_dependencies(["office365"], extras="sharepoint")
    def get_client(self) -> "ClientContext":
        """Build a SharePoint ``ClientContext`` for ``self.site``.

        Returns:
            The client context with the client credentials attached.

        Raises:
            Exception: whatever the office365 library raised while building the
                client; it is logged and re-raised unchanged.
        """
        from office365.runtime.auth.client_credential import ClientCredential
        from office365.sharepoint.client_context import ClientContext

        try:
            credentials = ClientCredential(self.client_id, self.access_config.client_cred)
            site_client = ClientContext(self.site).with_credentials(credentials)
        except Exception as e:
            logger.error(f"Couldn't set Sharepoint client: {e}")
            # Bare raise keeps the original traceback intact.
            raise
        return site_client

    @requires_dependencies(["msal"], extras="sharepoint")
    def get_permissions_token(self):
        """Acquire a Microsoft Graph token with the permissions credentials.

        Returns:
            The token response returned by msal's ``acquire_token_for_client``.

        Raises:
            ValueError: if ``permissions_config`` is not set, or if msal rejects
                the supplied credentials.
            SourceConnectionNetworkError: if the token response reports an error.
        """
        from msal import ConfidentialClientApplication

        permissions_config = self.permissions_config
        if permissions_config is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("permissions_config must be set to fetch a permissions token")

        try:
            app = ConfidentialClientApplication(
                authority=f"{permissions_config.authority_url}/"
                f"{permissions_config.permissions_tenant}",
                client_id=permissions_config.permissions_application_id,
                client_credential=permissions_config.permissions_client_cred,
            )
            token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
        except ValueError:
            logger.error("Couldn't set up credentials for Sharepoint")
            raise
        if "error" in token:
            # .get() so that a response without a description still surfaces the
            # real error rather than a KeyError.
            raise SourceConnectionNetworkError(
                "failed to fetch token, {}: {}".format(
                    token["error"], token.get("error_description", "no description provided")
                )
            )
        return token

    @requires_dependencies(["office365"], extras="sharepoint")
    def get_permissions_client(self) -> Optional["GraphClient"]:
        """Build a ``GraphClient`` for permissions lookups.

        Returns:
            ``None`` when no ``permissions_config`` is set; otherwise a client that
            is given ``get_permissions_token`` as its token callback.
        """
        from office365.graph_client import GraphClient

        if self.permissions_config is None:
            return None

        client = GraphClient(self.get_permissions_token)
        return client
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@dataclass
class SharepointIndexerConfig(IndexerConfig):
    """Options for the SharePoint indexer."""

    # Server-relative folder to start from. When unset, the indexer's get_root()
    # falls back to the default document library and stores its root folder name here.
    path: Optional[str] = None
    # Presumably toggles descent into sub-folders; the consumer of this flag is
    # not visible in this part of the file -- confirm.
    recursive: bool = False
    # Presumably skip documents / site pages / lists respectively; the consumers
    # of these flags are not visible in this part of the file -- confirm.
    omit_files: bool = False
    omit_pages: bool = False
    omit_lists: bool = False
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@dataclass
|
|
133
|
+
class SharepointIndexer(Indexer):
|
|
134
|
+
connection_config: SharepointConnectionConfig
|
|
135
|
+
index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())
|
|
136
|
+
|
|
137
|
+
def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
    """Return the files in ``folder``, optionally descending into sub-folders.

    Without ``recursive`` only the folder's own files are loaded and the
    folder's file collection is returned as-is. With ``recursive`` the files of
    every sub-folder are appended, skipping any sub-folder whose server-relative
    URL contains ``/Forms``.
    """
    if recursive:
        folder.expand(["Files", "Folders"]).get().execute_query()
        collected: list["File"] = list(folder.files)
        subfolders: list["Folder"] = list(folder.folders)
        for subfolder in subfolders:
            if "/Forms" not in subfolder.serverRelativeUrl:
                collected.extend(self.list_files(subfolder, recursive))
        return collected

    folder.expand(["Files"]).get().execute_query()
    return folder.files
|
|
150
|
+
|
|
151
|
+
def get_properties(self, raw_properties: dict) -> dict:
    """Return the subset of ``raw_properties`` that can be serialized to JSON.

    Entries with a falsy value (``None``, ``""``, ``0``, ``False``, empty
    containers) are dropped first. Of the rest, an entry is kept only if
    ``json.dumps`` accepts its value.

    Args:
        raw_properties: Property name to value mapping.

    Returns:
        A new dict with the surviving entries, in their original order.
    """
    filtered_properties = {}
    for k, v in raw_properties.items():
        if not v:
            continue
        try:
            json.dumps(v)
        except (TypeError, ValueError):
            # TypeError: value type is not JSON serializable.
            # ValueError: e.g. a circular reference. Either way the property
            # is skipped rather than aborting the whole record.
            continue
        filtered_properties[k] = v
    return filtered_properties
|
|
161
|
+
|
|
162
|
+
def list_pages(self, client: "ClientContext") -> list["SitePage"]:
    """Query the site's pages through ``client`` and return the result."""
    return client.site_pages.pages.get().execute_query()
|
|
165
|
+
|
|
166
|
+
def page_to_file_data(self, site_page: "SitePage") -> FileData:
    """Load a site page's properties and describe the page as a ``FileData``.

    Args:
        site_page: The page to describe; all of its properties are expanded
            with one query before they are read.

    Returns:
        A ``FileData`` identified by the page's ``UniqueId``. Dates are stored
        as stringified timestamps, and the JSON-serializable page properties
        are attached as additional metadata.
    """
    site_page.expand(site_page.properties.keys()).get().execute_query()
    version = site_page.properties.get("Version", None)
    unique_id = site_page.properties.get("UniqueId", None)
    modified_date = site_page.properties.get("Modified", None)
    url = site_page.properties.get("AbsoluteUrl", None)
    date_modified_dt = parse_datetime(modified_date) if modified_date else None
    # "0001-01-01T08:00:00Z" is treated the same as a missing publish date.
    date_created_at = (
        parse_datetime(site_page.first_published)
        if (site_page.first_published and site_page.first_published != "0001-01-01T08:00:00Z")
        else None
    )
    # A missing or empty Url must not break the slash handling below.
    file_path = site_page.get_property("Url", "") or ""
    # Drop one leading slash; startswith() is safe on an empty string, unlike [0].
    server_path = file_path[1:] if file_path.startswith("/") else file_path
    # str.replace() rejects None, so an unset index path becomes "" (a no-op).
    index_path = self.index_config.path or ""
    additional_metadata = self.get_properties(raw_properties=site_page.properties)
    additional_metadata["sharepoint_content_type"] = SharepointContentType.SITEPAGE.value
    return FileData(
        identifier=unique_id,
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(
            filename=site_page.file_name,
            fullpath=file_path,
            rel_path=file_path.replace(index_path, ""),
        ),
        metadata=DataSourceMetadata(
            url=url,
            version=version,
            date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
            date_created=str(date_created_at.timestamp()) if date_created_at else None,
            date_processed=str(time()),
            record_locator={
                "server_path": server_path,
            },
        ),
        additional_metadata=additional_metadata,
    )
|
|
202
|
+
|
|
203
|
+
def file_to_file_data(self, client: "ClientContext", file: "File") -> FileData:
    """Load a file's properties and describe the file as a ``FileData``.

    Args:
        client: SharePoint client; its ``base_url`` is used for the absolute
            URL and the record locator.
        file: The file to describe; all of its properties are expanded with
            one query before they are read.

    Returns:
        A ``FileData`` identified by the file's ``unique_id``. ``rel_path`` is
        the server-relative URL with the index path removed and leading
        slashes stripped.
    """
    file.expand(file.properties.keys()).get().execute_query()
    absolute_url = f"{client.base_url}{quote(file.serverRelativeUrl)}"
    date_modified_dt = (
        parse_datetime(file.time_last_modified) if file.time_last_modified else None
    )

    date_created_at = parse_datetime(file.time_created) if file.time_created else None
    additional_metadata = self.get_properties(raw_properties=file.properties)
    additional_metadata["sharepoint_content_type"] = SharepointContentType.DOCUMENT.value
    fullpath = str(file.serverRelativeUrl)
    # str.replace() rejects None, so an unset index path becomes "" (a no-op).
    # lstrip() removes every leading slash and, unlike indexing rel_path[0],
    # does not fail when nothing is left of the path.
    rel_path = fullpath.replace(self.index_config.path or "", "").lstrip("/")
    return FileData(
        identifier=file.unique_id,
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(
            filename=file.name,
            fullpath=fullpath,
            rel_path=rel_path,
        ),
        metadata=DataSourceMetadata(
            url=absolute_url,
            version=f"{file.major_version}.{file.minor_version}",
            date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
            date_created=str(date_created_at.timestamp()) if date_created_at else None,
            date_processed=str(time()),
            record_locator={"server_path": file.serverRelativeUrl, "site_url": client.base_url},
        ),
        additional_metadata=additional_metadata,
    )
|
|
235
|
+
|
|
236
|
+
def get_root(self, client: "ClientContext") -> "Folder":
    """Return the folder indexing starts from.

    A configured ``index_config.path`` is looked up by server-relative path.
    Otherwise the root folder of the default document library is fetched, and
    its name is written back to ``index_config.path``.
    """
    configured_path = self.index_config.path
    if configured_path:
        return client.web.get_folder_by_server_relative_path(configured_path)

    library = client.web.default_document_library()
    library_root = library.root_folder.get().execute_query()
    # Remember the fallback so later path handling has a value to work with.
    self.index_config.path = library_root.name
    return library_root
|
|
244
|
+
|
|
245
|
+
def get_site_url(self, client: "ClientContext") -> str:
    """Query the client's web object and return its ``url``."""
    return client.web.get().execute_query().url
|
|
248
|
+
|
|
249
|
+
def get_site(self, permissions_client: "GraphClient", site_url) -> "Site":
    """Look up the site for ``site_url`` through the permissions client."""
    site_query = permissions_client.sites.get_by_url(url=site_url)
    return site_query.execute_query()
|
|
251
|
+
|
|
252
|
+
def get_permissions_items(self, site: "Site") -> list["DriveItem"]:
    """Collect the root-level children of every drive on ``site``."""
    # TODO find a way to narrow this search down by name of drive
    site_drives: list["Drive"] = site.drives.get_all().execute_query()
    return [
        drive_item
        for drive in site_drives
        for drive_item in drive.root.children.get_all().execute_query()
    ]
|
|
259
|
+
|
|
260
|
+
def map_permission(self, permission: "Permission") -> dict:
    """Flatten a permission object into a plain dictionary.

    Scalar attributes are copied as-is (``roles`` is materialized into a
    list); the remaining attributes are converted with their ``to_json()``
    method.
    """
    mapped = {
        "id": permission.id,
        "roles": list(permission.roles),
        "share_id": permission.share_id,
        "has_password": permission.has_password,
    }
    json_backed_fields = (
        "link",
        "granted_to_identities",
        "granted_to",
        "granted_to_v2",
        "granted_to_identities_v2",
        "invitation",
    )
    for field_name in json_backed_fields:
        mapped[field_name] = getattr(permission, field_name).to_json()
    return mapped
|
|
273
|
+
|
|
274
|
+
def enrich_permissions_on_files(self, all_file_data: list[FileData], site_url: str) -> None:
    """Attach permission data to each file that maps to exactly one drive item.

    Files are matched to drive items by comparing the ``ETag`` entry of
    ``additional_metadata`` with each drive item's ``etag``. Files with no
    etag, no matching item, or more than one matching item are left
    untouched (the ambiguous case is logged as a warning).

    Args:
        all_file_data: File records to enrich in place.
        site_url: URL of the site whose drives are searched.
    """
    logger.debug("Enriching permissions on files")
    permission_client = self.connection_config.get_permissions_client()
    if permission_client is None:
        return
    site = self.get_site(permissions_client=permission_client, site_url=site_url)

    # Index the drive items by etag once, instead of rescanning the whole
    # item list for every file.
    items_by_etag: dict = {}
    for item in self.get_permissions_items(site=site):
        items_by_etag.setdefault(item.etag, []).append(item)

    for file_data in all_file_data:
        etag = file_data.additional_metadata.get("ETag")
        if not etag:
            continue
        matching_items = items_by_etag.get(etag, [])
        if not matching_items:
            continue
        if len(matching_items) > 1:
            # Ambiguous match: there is no way to tell which item is this file.
            logger.warning(
                "Found multiple drive items with etag matching {}, skipping: {}".format(
                    etag, ", ".join([i.name for i in matching_items])
                )
            )
            continue
        matching_item = matching_items[0]
        permissions: list["Permission"] = matching_item.permissions.get_all().execute_query()
        file_data.metadata.permissions_data = [
            self.map_permission(permission=permission) for permission in permissions
        ]
|
|
301
|
+
|
|
302
|
+
@property
def process_permissions(self) -> bool:
    """Whether all settings needed to fetch permissions are present.

    Returns True only when the permissions tenant, client credential and
    application id are all truthy. The result is coerced to a real ``bool``
    so it matches the annotation instead of leaking the last operand of the
    ``and`` chain (a string or ``None``).
    """
    permissions_config = self.connection_config.permissions_config
    # NOTE(review): presumably permissions_config may be unset when no
    # permissions are configured - treat that as "do not process" rather
    # than failing on attribute access; confirm against the config class.
    if not permissions_config:
        return False
    return bool(
        permissions_config.permissions_tenant
        and permissions_config.permissions_client_cred
        and permissions_config.permissions_application_id
    )
|
|
309
|
+
|
|
310
|
+
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
    """Yield file records for documents and then for site pages.

    Documents are listed from the root folder (skipped when
    ``index_config.omit_files`` is set) and enriched with permissions when
    ``process_permissions`` is truthy. Site pages follow unless
    ``index_config.omit_pages`` is set.
    """
    sp_client = self.connection_config.get_client()
    root = self.get_root(client=sp_client)
    logger.debug(f"processing content from path: {self.index_config.path}")

    if not self.index_config.omit_files:
        documents = [
            self.file_to_file_data(file=entry, client=sp_client)
            for entry in self.list_files(root, recursive=self.index_config.recursive)
        ]
        if self.process_permissions:
            self.enrich_permissions_on_files(
                all_file_data=documents, site_url=self.get_site_url(client=sp_client)
            )
        yield from documents

    if not self.index_config.omit_pages:
        for site_page in self.list_pages(client=sp_client):
            page_data = self.page_to_file_data(site_page=site_page)
            # Record which site the page came from alongside its locator.
            page_data.metadata.record_locator["site_url"] = sp_client.base_url
            yield page_data
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
@dataclass
class SharepointDownloaderConfig(DownloaderConfig):
    """Downloader settings for SharePoint; adds no fields to the base config."""
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
@dataclass
class SharepointDownloader(Downloader):
    """Download SharePoint documents and render site pages to local files."""

    connection_config: SharepointConnectionConfig
    download_config: SharepointDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the local path for ``file_data`` under ``self.download_dir``.

        The path mirrors the record's ``fullpath`` (minus one leading slash);
        site pages get an ``.html`` suffix.
        """
        content_type = file_data.additional_metadata.get("sharepoint_content_type")
        rel_path = file_data.source_identifiers.fullpath
        # Drop a leading slash so the join below stays inside download_dir.
        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
        download_path = self.download_dir / Path(rel_path)
        if content_type == SharepointContentType.SITEPAGE.value:
            # Update output extension to html if site page
            download_path = download_path.with_suffix(".html")
        return download_path

    def get_document(self, file_data: FileData) -> DownloadResponse:
        """Download the file identified by ``file_data.identifier`` to disk."""
        client: "ClientContext" = self.connection_config.get_client()
        file: "File" = client.web.get_file_by_id(unique_id=file_data.identifier)
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        logger.debug(
            f"writing document content {file_data.source_identifiers.fullpath} to {download_path}"
        )
        with download_path.open("wb") as f:
            file.download(f).execute_query()
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def get_site_page(self, file_data: FileData) -> DownloadResponse:
        """Build an HTML file for a site page from its stored metadata.

        Titles found in the ``LayoutWebpartsContent`` JSON and ``innerHTML``
        fragments found in the ``CanvasContent1`` JSON are concatenated,
        wrapped in a ``<div>``, parsed with lxml and written pretty-printed.
        """
        # TODO fetch comments for site page as well
        from lxml import etree, html

        canvas_content_raw = file_data.additional_metadata.get("CanvasContent1")
        layout_web_parts_content_raw = file_data.additional_metadata.get("LayoutWebpartsContent")
        html_content = []
        if layout_web_parts_content_raw:
            layout_web_parts_content = json.loads(layout_web_parts_content_raw)
            for web_part in layout_web_parts_content:
                properties = web_part.get("properties", {})
                if title := properties.get("title"):
                    html_content.append(f"<title>{title}</title>")
        if canvas_content_raw:
            canvas_content = json.loads(canvas_content_raw)
            for content in canvas_content:
                if inner_html := content.get("innerHTML"):
                    html_content.append(inner_html)
        htmls = "".join(html_content)
        content = f"<div>{htmls}</div>"
        document = html.fromstring(content)
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        logger.debug(
            f"writing site page content {file_data.source_identifiers.filename} to {download_path}"
        )
        # The serialized document is a unicode string; write it as UTF-8
        # explicitly so non-ASCII page content does not depend on (or fail
        # under) the platform's default encoding.
        with download_path.open("w", encoding="utf-8") as f:
            f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download ``file_data`` according to its ``sharepoint_content_type``.

        Raises:
            ValueError: If the content type metadata is missing.
        """
        content_type = file_data.additional_metadata.get("sharepoint_content_type")
        if not content_type:
            raise ValueError(
                f"Missing sharepoint_content_type metadata: {file_data.additional_metadata}"
            )
        if content_type == SharepointContentType.DOCUMENT.value:
            return self.get_document(file_data=file_data)
        elif content_type == SharepointContentType.SITEPAGE.value:
            return self.get_site_page(file_data=file_data)
        # NOTE(review): any other content type falls through and returns
        # None implicitly - confirm whether callers expect that.
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
# Source registry entry bundling the SharePoint connection config with its
# indexer and downloader classes and their configs.
sharepoint_source_entry = SourceRegistryEntry(
    indexer=SharepointIndexer,
    indexer_config=SharepointIndexerConfig,
    downloader=SharepointDownloader,
    downloader_config=SharepointDownloaderConfig,
    connection_config=SharepointConnectionConfig,
)
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from datetime import date, datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from dateutil import parser
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
12
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
13
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
+
from unstructured_ingest.utils.table import convert_to_pandas_dataframe
|
|
15
|
+
from unstructured_ingest.v2.interfaces import (
|
|
16
|
+
AccessConfig,
|
|
17
|
+
ConnectionConfig,
|
|
18
|
+
FileData,
|
|
19
|
+
UploadContent,
|
|
20
|
+
Uploader,
|
|
21
|
+
UploaderConfig,
|
|
22
|
+
UploadStager,
|
|
23
|
+
UploadStagerConfig,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.v2.logger import logger
|
|
26
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
27
|
+
DestinationRegistryEntry,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from singlestoredb.connection import Connection
|
|
32
|
+
|
|
33
|
+
# Connector identifier; used in this module as the default `connector_type` of SingleStoreUploader.
CONNECTOR_TYPE = "singlestore"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class SingleStoreAccessConfig(AccessConfig):
    """Secret portion of the SingleStore connection settings."""

    # Password passed to the database connection; optional.
    password: Optional[str] = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class SingleStoreConnectionConfig(ConnectionConfig):
    """Connection settings for a SingleStore database."""

    host: Optional[str] = None
    port: Optional[int] = None
    user: Optional[str] = None
    database: Optional[str] = None
    access_config: SingleStoreAccessConfig = enhanced_field(sensitive=True)

    @requires_dependencies(["singlestoredb"], extras="singlestore")
    def get_connection(self) -> "Connection":
        """Open and return a new ``singlestoredb`` connection from these settings."""
        # Imported lazily: the dependency is optional (see the decorator).
        import singlestoredb as s2

        return s2.connect(
            host=self.host,
            port=self.port,
            database=self.database,
            user=self.user,
            password=self.access_config.password,
        )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class SingleStoreUploadStagerConfig(UploadStagerConfig):
    """Options for staging elements before upload to SingleStore."""

    # Forwarded to convert_to_pandas_dataframe when building the staged table.
    drop_empty_cols: bool = False
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class SingleStoreUploadStager(UploadStager):
    """Convert an elements JSON file into a CSV file ready for upload."""

    upload_stager_config: SingleStoreUploadStagerConfig

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Parse ``date_string`` as a numeric timestamp, else as a date string.

        The value is first interpreted as a float timestamp; if that fails it
        is handed to ``dateutil.parser.parse``.
        """
        try:
            timestamp = float(date_string)
            return datetime.fromtimestamp(timestamp)
        except (TypeError, ValueError, OverflowError, OSError) as e:
            # These are the errors float() and datetime.fromtimestamp() raise
            # for non-numeric or out-of-range input.
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Write the elements in ``elements_filepath`` to a CSV file.

        Args:
            elements_filepath: JSON file holding the elements.
            file_data: Record describing the source file (not used here).
            output_dir: Directory the CSV is written into.
            output_filename: Base name of the CSV; ``.csv`` is appended.

        Returns:
            Path of the written CSV file.
        """
        # Read the JSON as UTF-8 explicitly rather than relying on the
        # platform default encoding.
        with open(elements_filepath, encoding="utf-8") as elements_file:
            elements_contents = json.load(elements_file)
        output_path = Path(output_dir) / Path(f"{output_filename}.csv")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        df = convert_to_pandas_dataframe(
            elements_dict=elements_contents,
            drop_empty_cols=self.upload_stager_config.drop_empty_cols,
        )
        datetime_columns = [
            "data_source_date_created",
            "data_source_date_modified",
            "data_source_date_processed",
        ]
        for column in filter(lambda x: x in df.columns, datetime_columns):
            df[column] = df[column].apply(self.parse_date_string)
        if "data_source_record_locator" in df.columns:
            # Serialize the locator to a JSON string so it fits in one CSV cell.
            df["data_source_record_locator"] = df["data_source_record_locator"].apply(
                lambda x: json.dumps(x) if x else None
            )

        # Write UTF-8 so the file matches what the uploader's pd.read_csv
        # expects by default; newline="" is what pandas asks for when it is
        # handed an already-open file object.
        with output_path.open("w", encoding="utf-8", newline="") as output_file:
            df.to_csv(output_file, index=False)
        return output_path
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@dataclass
class SingleStoreUploaderConfig(UploaderConfig):
    """Upload settings for the SingleStore destination."""

    # Table that rows are inserted into.
    table_name: str
    # Number of rows sent per executemany call.
    batch_size: int = 100
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@dataclass
class SingleStoreUploader(Uploader):
    """Insert staged CSV files into a SingleStore table."""

    connection_config: SingleStoreConnectionConfig
    upload_config: SingleStoreUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    def upload_csv(self, content: UploadContent) -> None:
        """Load the CSV at ``content.path`` and insert its rows in batches."""
        df = pd.read_csv(content.path)
        logger.debug(
            f"uploading {len(df)} entries to {self.connection_config.database} "
            f"db in table {self.upload_config.table_name}"
        )
        column_list = ", ".join(df.columns)
        placeholders = ", ".join(["%s"] * len(df.columns))
        stmt = "INSERT INTO {} ({}) VALUES ({})".format(
            self.upload_config.table_name, column_list, placeholders
        )
        logger.debug(f"sql statement: {stmt}")
        # Swap NaN for None before the rows are handed to the driver.
        df.replace({np.nan: None}, inplace=True)
        rows = list(df.itertuples(index=False, name=None))
        with self.connection_config.get_connection() as conn, conn.cursor() as cur:
            batches = batch_generator(rows, batch_size=self.upload_config.batch_size)
            for batch in batches:
                cur.executemany(stmt, batch)
                # Commit after each batch.
                conn.commit()

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Upload every staged file in ``contents``, one after another."""
        for staged in contents:
            self.upload_csv(content=staged)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# Destination registry entry bundling the SingleStore connection config with
# its upload stager and uploader classes and their configs.
singlestore_destination_entry = DestinationRegistryEntry(
    upload_stager=SingleStoreUploadStager,
    upload_stager_config=SingleStoreUploadStagerConfig,
    uploader=SingleStoreUploader,
    uploader_config=SingleStoreUploaderConfig,
    connection_config=SingleStoreConnectionConfig,
)
|