unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Salesforce Connector
|
|
3
|
+
Able to download Account, Case, Campaign, EmailMessage, Lead
|
|
4
|
+
Salesforce returns everything as a list of json.
|
|
5
|
+
This saves each entry as a separate file to be partitioned.
|
|
6
|
+
Using JWT authorization
|
|
7
|
+
https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_key_and_cert.htm
|
|
8
|
+
https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_connected_app.htm
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from collections import OrderedDict
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from email.utils import formatdate
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from string import Template
|
|
17
|
+
from textwrap import dedent
|
|
18
|
+
from typing import TYPE_CHECKING, Any, Generator, Type
|
|
19
|
+
|
|
20
|
+
from dateutil import parser
|
|
21
|
+
from unstructured.documents.elements import DataSourceMetadata
|
|
22
|
+
|
|
23
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
24
|
+
from unstructured_ingest.error import SourceConnectionNetworkError
|
|
25
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
26
|
+
from unstructured_ingest.v2.interfaces import (
|
|
27
|
+
AccessConfig,
|
|
28
|
+
ConnectionConfig,
|
|
29
|
+
Downloader,
|
|
30
|
+
DownloaderConfig,
|
|
31
|
+
DownloadResponse,
|
|
32
|
+
FileData,
|
|
33
|
+
Indexer,
|
|
34
|
+
IndexerConfig,
|
|
35
|
+
SourceIdentifiers,
|
|
36
|
+
)
|
|
37
|
+
from unstructured_ingest.v2.logger import logger
|
|
38
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
39
|
+
SourceRegistryEntry,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class MissingCategoryError(Exception):
    """Signals that a record type name does not correspond to any known category."""
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Connector name recorded on every FileData produced here and on the downloader.
CONNECTOR_TYPE = "salesforce"

if TYPE_CHECKING:
    # Needed for annotations only; the methods that use the client import it lazily.
    from simple_salesforce import Salesforce

# API version handed to the Salesforce client constructor.
SALESFORCE_API_VERSION = "57.0"

# Record types the indexer accepts; any other category is rejected at construction.
# TODO: Add more categories as needed
ACCEPTED_CATEGORIES: list[str] = ["Account", "Case", "Campaign", "EmailMessage", "Lead"]
|
|
56
|
+
|
|
57
|
+
# Generic minimal email template used only
# to process EmailMessage records as .eml files.
# Placeholders ($date, $message_identifier, $subject, $from_email, $to_email,
# $textbody, $htmlbody) are filled by SalesforceDownloader._eml_for_record.
EMAIL_TEMPLATE = Template(
    """MIME-Version: 1.0
Date: $date
Message-ID: $message_identifier
Subject: $subject
From: $from_email
To: $to_email
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
--00000000000095c9b205eff92630
Content-Type: text/plain; charset="UTF-8"
$textbody
--00000000000095c9b205eff92630
Content-Type: text/html; charset="UTF-8"
$htmlbody
--00000000000095c9b205eff92630--
""",
)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass
class SalesforceAccessConfig(AccessConfig):
    """Credentials for the Salesforce connection (JWT authorization)."""

    # Passed to the Salesforce client as ``consumer_key``.
    consumer_key: str
    # Either the PEM text of a private key or a filesystem path to a key file.
    private_key: str

    @requires_dependencies(["cryptography"])
    def get_private_key_value_and_type(self) -> tuple[str, Type]:
        """Classify ``private_key`` as inline PEM text or as a path to a key file.

        Returns:
            ``(private_key, str)`` when the value parses as an unencrypted PEM
            private key, or ``(private_key, Path)`` when it names an existing file.

        Raises:
            ValueError: if the value is neither parseable PEM nor an existing file.
        """
        from cryptography.hazmat.primitives import serialization

        try:
            serialization.load_pem_private_key(data=self.private_key.encode("utf-8"), password=None)
        except ValueError:
            # Not PEM text; fall through and try interpreting it as a path.
            pass
        else:
            return self.private_key, str

        lookup_error = None
        try:
            if Path(self.private_key).is_file():
                return self.private_key, Path
        except OSError as err:
            # ``is_file`` propagates errors such as "File name too long", which a
            # malformed multi-line PEM string can trigger. Report those as the same
            # ValueError callers already expect, keeping the OS error as the cause.
            lookup_error = err

        raise ValueError("private_key does not contain PEM private key or path") from lookup_error
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass
class SalesforceConnectionConfig(ConnectionConfig):
    """Connection settings from which a Salesforce client is built."""

    username: str
    access_config: SalesforceAccessConfig = enhanced_field(sensitive=True)

    @requires_dependencies(["simple_salesforce"], extras="salesforce")
    def get_client(self) -> "Salesforce":
        """Build a ``simple_salesforce.Salesforce`` client from this configuration.

        The private key is forwarded either as inline text or as a file path,
        depending on how the access config classifies it; the other argument
        is passed as ``None``.
        """
        from simple_salesforce import Salesforce

        key_value, key_kind = self.access_config.get_private_key_value_and_type()
        key_file = key_value if key_kind is Path else None
        key_text = key_value if key_kind is str else None

        return Salesforce(
            username=self.username,
            consumer_key=self.access_config.consumer_key,
            privatekey_file=key_file,
            privatekey=key_text,
            version=SALESFORCE_API_VERSION,
        )
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@dataclass
class SalesforceIndexerConfig(IndexerConfig):
    """Indexer settings: the Salesforce record types to list."""

    # Record type names to query, e.g. "Account" or "EmailMessage".
    categories: list[str]
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@dataclass
class SalesforceIndexer(Indexer):
    """Lists Salesforce records of the configured categories as ``FileData`` entries."""

    connection_config: SalesforceConnectionConfig
    index_config: SalesforceIndexerConfig

    def __post_init__(self):
        """Reject any configured category that is not in ``ACCEPTED_CATEGORIES``.

        Raises:
            ValueError: on the first unsupported category.
        """
        for record_type in self.index_config.categories:
            if record_type not in ACCEPTED_CATEGORIES:
                raise ValueError(f"{record_type} not currently an accepted Salesforce category")

    def get_file_extension(self, record_type) -> str:
        """Return the file extension used when saving a record of ``record_type``.

        ``EmailMessage`` records are saved as ``.eml``; ``Account``, ``Lead``,
        ``Case`` and ``Campaign`` records are saved as ``.xml``.

        Raises:
            MissingCategoryError: if ``record_type`` is none of the above.
        """
        if record_type == "EmailMessage":
            return ".eml"
        if record_type in ["Account", "Lead", "Case", "Campaign"]:
            return ".xml"
        raise MissingCategoryError(
            f"There are no categories with the name: {record_type}",
        )

    @requires_dependencies(["simple_salesforce"], extras="salesforce")
    def list_files(self) -> list[FileData]:
        """Get Salesforce Ids for the records.

        Send them to next phase where each doc gets downloaded into the
        appropriate format for partitioning.

        Raises:
            SalesforceMalformedRequest: re-raised unchanged (after logging) when
                Salesforce rejects a listing query.
        """
        from simple_salesforce.exceptions import SalesforceMalformedRequest

        client = self.connection_config.get_client()

        files_list = []
        for record_type in self.index_config.categories:
            try:
                # Get ids from Salesforce
                records = client.query_all_iter(
                    f"select Id, SystemModstamp, CreatedDate, LastModifiedDate from {record_type}",
                )
                for record in records:
                    # The type reported by Salesforce drives both extension and folder.
                    reported_type = record["attributes"]["type"]
                    record_with_extension = record["Id"] + self.get_file_extension(reported_type)
                    files_list.append(
                        FileData(
                            connector_type=CONNECTOR_TYPE,
                            identifier=record["Id"],
                            source_identifiers=SourceIdentifiers(
                                filename=record_with_extension,
                                fullpath=f"{reported_type}/{record_with_extension}",
                            ),
                            metadata=DataSourceMetadata(
                                url=record["attributes"]["url"],
                                version=str(parser.parse(record["SystemModstamp"]).timestamp()),
                                date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
                                date_modified=str(
                                    parser.parse(record["LastModifiedDate"]).timestamp()
                                ),
                                record_locator={"id": record["Id"]},
                            ),
                            additional_metadata={"record_type": reported_type},
                        )
                    )
            except SalesforceMalformedRequest as e:
                # Re-raise the original exception: SalesforceMalformedRequest cannot be
                # constructed from a single message string (its constructor requires
                # url, status, resource_name and content), so building a new one here
                # would fail with TypeError and hide the real error.
                logger.error(f"Problem with Salesforce query: {e}")
                raise

        return files_list

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield every ``FileData`` produced by ``list_files``."""
        yield from self.list_files()
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
@dataclass
class SalesforceDownloaderConfig(DownloaderConfig):
    """Downloader settings for Salesforce; adds no fields to ``DownloaderConfig``."""
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
@dataclass
class SalesforceDownloader(Downloader):
    """Fetches individual Salesforce records and writes them as .eml or .xml files."""

    connection_config: SalesforceConnectionConfig
    download_config: SalesforceDownloaderConfig = field(
        default_factory=lambda: SalesforceDownloaderConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the local path for ``file_data`` beneath ``self.download_dir``."""
        rel_path = file_data.source_identifiers.relative_path
        # Drop one leading slash so the join stays inside the download directory.
        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
        return self.download_dir / Path(rel_path)

    def _xml_for_record(self, record: OrderedDict) -> str:
        """Creates partitionable xml file from a record.

        Each non-``OrderedDict`` value becomes an ``<item>`` under ``<root>`` whose
        text is ``"<key>: <value>"``; nested ``OrderedDict`` values are flattened,
        with the parent keys joined by dots as a prefix.
        """
        import xml.etree.ElementTree as ET

        def create_xml_doc(data, parent, prefix=""):
            for key, value in data.items():
                if isinstance(value, OrderedDict):
                    create_xml_doc(value, parent, prefix=f"{prefix}{key}.")
                else:
                    item = ET.Element("item")
                    item.text = f"{prefix}{key}: {value}"
                    parent.append(item)

        root = ET.Element("root")
        create_xml_doc(record, root)

        xml_string = ET.tostring(root, encoding="utf-8", xml_declaration=True).decode()
        return xml_string

    def _eml_for_record(self, email_json: dict[str, Any]) -> str:
        """Recreates standard expected .eml format using template.

        Fields absent from ``email_json`` are substituted as the text ``None``,
        because ``dict.get`` returns ``None`` for them.
        """
        eml = EMAIL_TEMPLATE.substitute(
            date=formatdate(parser.parse(email_json.get("MessageDate")).timestamp()),
            message_identifier=email_json.get("MessageIdentifier"),
            subject=email_json.get("Subject"),
            from_email=email_json.get("FromAddress"),
            to_email=email_json.get("ToAddress"),
            textbody=email_json.get("TextBody"),
            htmlbody=email_json.get("HtmlBody"),
        )
        return dedent(eml)

    @SourceConnectionNetworkError.wrap
    def _get_response(self, file_data: FileData) -> OrderedDict:
        """Query Salesforce for the standard fields of the record behind ``file_data``."""
        client = self.connection_config.get_client()
        return client.query(
            f"select FIELDS(STANDARD) from {file_data.additional_metadata['record_type']} where Id='{file_data.identifier}'",  # noqa: E501
        )

    def get_record(self, file_data: FileData) -> OrderedDict:
        """Return the first record matching ``file_data.identifier``.

        Raises:
            ValueError: if the query response contains no records.
        """
        # Get record from Salesforce based on id
        response = self._get_response(file_data)
        logger.debug(f"response was returned for salesforce record id: {file_data.identifier}")
        records = response["records"]
        if not records:
            raise ValueError(
                f"No record found with record id {file_data.identifier}: {json.dumps(response)}"
            )
        record_json = records[0]
        return record_json

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download one record, render it, and write it to the download path.

        ``EmailMessage`` records are rendered as .eml text; every other record
        type is rendered as XML.

        Raises:
            SourceConnectionNetworkError: if rendering or writing the document
                fails; the underlying exception is attached as the cause.
        """
        record = self.get_record(file_data)

        try:
            if file_data.additional_metadata["record_type"] == "EmailMessage":
                document = self._eml_for_record(record)
            else:
                document = self._xml_for_record(record)
            download_path = self.get_download_path(file_data=file_data)
            download_path.parent.mkdir(parents=True, exist_ok=True)

            # Both renderings declare UTF-8 (XML declaration / MIME charset), so write
            # UTF-8 explicitly rather than relying on the platform's default encoding.
            with open(download_path, "w", encoding="utf-8") as page_file:
                page_file.write(document)

        except Exception as e:
            logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
            raise SourceConnectionNetworkError(
                f"failed to download file {file_data.identifier}"
            ) from e

        return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
# Source registry entry bundling the Salesforce config, indexer and downloader classes.
salesforce_source_entry = SourceRegistryEntry(
    connection_config=SalesforceConnectionConfig,
    indexer_config=SalesforceIndexerConfig,
    indexer=SalesforceIndexer,
    downloader_config=SalesforceDownloaderConfig,
    downloader=SalesforceDownloader,
)
|