unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import typing as t
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from mimetypes import guess_extension
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
|
|
11
|
+
|
|
12
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
13
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
14
|
+
from unstructured_ingest.interfaces import (
|
|
15
|
+
AccessConfig,
|
|
16
|
+
BaseConnectorConfig,
|
|
17
|
+
BaseSessionHandle,
|
|
18
|
+
BaseSingleIngestDoc,
|
|
19
|
+
BaseSourceConnector,
|
|
20
|
+
ConfigSessionHandleMixin,
|
|
21
|
+
IngestDocCleanupMixin,
|
|
22
|
+
IngestDocSessionHandleMixin,
|
|
23
|
+
SourceConnectorCleanupMixin,
|
|
24
|
+
SourceMetadata,
|
|
25
|
+
)
|
|
26
|
+
from unstructured_ingest.logger import logger
|
|
27
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
28
|
+
from unstructured_ingest.utils.string_and_date_utils import json_to_dict
|
|
29
|
+
|
|
30
|
+
if t.TYPE_CHECKING:
|
|
31
|
+
from googleapiclient.discovery import Resource as GoogleAPIResource
|
|
32
|
+
from googleapiclient.http import MediaIoBaseDownload
|
|
33
|
+
|
|
34
|
+
# Naming templates that combine a Drive item's id with its display name.
# DIRECTORY_FORMAT is used when building per-folder sub-directories during traversal;
# FILE_FORMAT additionally appends an extension (presumably for downloaded file names --
# confirm against the code that consumes it).
FILE_FORMAT = "{id}-{name}{ext}"
DIRECTORY_FORMAT = "{id}-{name}"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class GoogleDriveSessionHandle(BaseSessionHandle):
    """Session handle that carries the Google Drive API client."""

    # Drive API resource object; callers in this module reach the API through
    # `session_handle.service.files()`.
    service: "GoogleAPIResource"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@requires_dependencies(["googleapiclient"], extras="google-drive")
def create_service_account_object(key_path: t.Union[str, dict], id=None):
    """
    Creates a service object for interacting with Google Drive.

    Providing a drive id enforces a key validation process: a listing request for the
    children of that id is executed, so credential/permission problems surface here.

    Note: when the key is given as a file path, the GOOGLE_APPLICATION_CREDENTIALS
    environment variable of the current process is overwritten with that path.

    Args:
        key_path: Path to Google Drive service account json file. (or the actual json)
        id: ID of a file on Google Drive. File has to be either publicly accessible or accessible
            to the service account. (The name shadows the builtin but is kept because it is
            part of the public signature.)

    Returns:
        Service account object

    Raises:
        ValueError: if the key is neither a dict nor a string, if the Drive API responds
            with an HTTP error, or if the default-credentials lookup fails. The underlying
            exception is attached as the cause for the latter two.
    """
    from google.auth import default, exceptions
    from google.oauth2 import service_account
    from googleapiclient.discovery import build
    from googleapiclient.errors import HttpError

    # Service account key can be a dict or a file path(str)
    # But the dict may come in as a string
    key_path = json_to_dict(key_path)

    try:
        if isinstance(key_path, dict):
            creds = service_account.Credentials.from_service_account_info(key_path)
        elif isinstance(key_path, str):
            # Let google.auth discover the key file through its standard env variable.
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
            creds, _ = default()
        else:
            raise ValueError(
                f"key path not recognized as a dictionary or a file path: "
                f"[{type(key_path)}] {key_path}",
            )
        service = build("drive", "v3", credentials=creds)

        if id:
            # Result is discarded on purpose: the request only has to succeed.
            service.files().list(
                spaces="drive",
                fields="files(id)",
                pageToken=None,
                corpora="user",
                q=f"'{id}' in parents",
            ).execute()

    except HttpError as exc:
        # Chain explicitly so the original HTTP error stays visible in tracebacks.
        raise ValueError(f"{exc.reason}") from exc
    except exceptions.DefaultCredentialsError as exc:
        raise ValueError("The provided API key is invalid.") from exc

    return service
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass
class GoogleDriveAccessConfig(AccessConfig):
    """Credentials used by the Google Drive connector."""

    # Service account key: either a path to the json key file or the key contents.
    # Declared with sensitive=True on the enhanced field.
    service_account_key: t.Union[str, dict] = enhanced_field(sensitive=True)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@dataclass
class SimpleGoogleDriveConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
    """Settings for the Google Drive source connector.

    `drive_id` identifies either a single document to process or a folder whose
    documents are all processed.
    """

    # Google Drive Specific Options
    drive_id: str
    access_config: GoogleDriveAccessConfig
    # When set, the source connector skips files whose extension differs from this value.
    extension: t.Optional[str] = None
    # Whether sub-folders are descended into (presumably forwarded to the listing
    # traversal -- confirm against the source connector).
    recursive: bool = False

    def create_session_handle(self) -> GoogleDriveSessionHandle:
        """Build a Drive API client from the configured key and wrap it in a session handle."""
        return GoogleDriveSessionHandle(
            service=create_service_account_object(self.access_config.service_account_key),
        )
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@dataclass
class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single Google Drive file: fetches its metadata, downloads it and writes its result.

    `meta` is the per-file dictionary this class reads from; the keys used here are
    "id", "name", "mimeType", "download_dir", "download_filepath" and "output_filepath".
    """

    connector_config: SimpleGoogleDriveConfig
    meta: t.Dict[str, str] = field(default_factory=dict)
    registry_name: str = "google_drive"

    @property
    def filename(self) -> Path:
        """Absolute local path the file is downloaded to (from meta["download_filepath"])."""
        return Path(self.meta.get("download_filepath")).resolve()  # type: ignore

    @property
    def _output_filename(self) -> Path:
        """Absolute path of the json result file (meta["output_filepath"] + ".json")."""
        return Path(f"{self.meta.get('output_filepath')}.json").resolve()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Identify this record by the configured drive id and the file's own id."""
        return {
            "drive_id": self.connector_config.drive_id,
            "file_id": self.meta["id"],
        }

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def update_source_metadata(self):
        """Query the Drive API for this file and populate `self.source_metadata`.

        A 404 response is logged and recorded as a non-existent source; any other
        HttpError is re-raised unchanged.
        """
        from googleapiclient.errors import HttpError

        try:
            file_obj = (
                self.session_handle.service.files()
                .get(
                    fileId=self.meta["id"],
                    fields="id, createdTime, modifiedTime, version, webContentLink",
                )
                .execute()
            )
        except HttpError as e:
            if e.status_code == 404:
                logger.error(f"File {self.meta['name']} not found")
                # The file is gone, so it must not be reported as existing.
                self.source_metadata = SourceMetadata(
                    exists=False,
                )
                return
            raise

        # Timestamps are parsed with a fixed "...%fZ" layout and re-emitted as ISO strings.
        date_created = None
        if dc := file_obj.get("createdTime", ""):
            date_created = datetime.strptime(
                dc,
                "%Y-%m-%dT%H:%M:%S.%fZ",
            ).isoformat()

        date_modified = None
        if dm := file_obj.get("modifiedTime", ""):
            date_modified = datetime.strptime(
                dm,
                "%Y-%m-%dT%H:%M:%S.%fZ",
            ).isoformat()

        self.source_metadata = SourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            version=file_obj.get("version", ""),
            source_url=file_obj.get("webContentLink", ""),
            exists=True,
        )

    @SourceConnectionNetworkError.wrap
    def _run_downloader(self, downloader: "MediaIoBaseDownload") -> bool:
        """Pull chunks from `downloader` until it stops reporting False; return that flag."""
        downloaded = False
        while downloaded is False:
            _, downloaded = downloader.next_chunk()
        return downloaded

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the file into memory and save it at `self.filename`.

        Files whose mime type starts with "application/vnd.google-apps" are exported
        using the mapping in GOOGLE_DRIVE_EXPORT_TYPES; when no mapping exists the file
        is logged as unsupported and skipped. All other files are fetched as-is.
        """
        from googleapiclient.http import MediaIoBaseDownload

        if self.meta.get("mimeType", "").startswith("application/vnd.google-apps"):
            export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
                self.meta.get("mimeType"),  # type: ignore
            )
            if not export_mime:
                logger.info(
                    f"File not supported. Name: {self.meta.get('name')} "
                    f"ID: {self.meta.get('id')} "
                    f"MimeType: {self.meta.get('mimeType')}",
                )
                return

            request = self.session_handle.service.files().export_media(
                fileId=self.meta.get("id"),
                mimeType=export_mime,
            )
        else:
            request = self.session_handle.service.files().get_media(fileId=self.meta.get("id"))
        file = io.BytesIO()
        downloader = MediaIoBaseDownload(file, request)
        self.update_source_metadata()
        downloaded = self._run_downloader(downloader=downloader)

        if not downloaded:
            logger.error(f"Error while downloading and saving file: {self.filename}.")
            return

        # The in-memory buffer and the Path object are always truthy, so only the
        # download flag and the directory's existence need to be checked.
        dir_ = Path(self.meta["download_dir"])
        if not dir_.is_dir():
            logger.debug(f"Creating directory: {self.meta.get('download_dir')}")
            dir_.mkdir(parents=True, exist_ok=True)

        with open(self.filename, "wb") as handler:
            handler.write(file.getbuffer())
        logger.debug(f"File downloaded: {self.filename}.")

    def write_result(self):
        """Write the structured json result for this doc. result must be json serializable."""
        if self.read_config.download_only:
            return
        self._output_filename.parent.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding is pinned
        # instead of relying on the platform default.
        with open(self._output_filename, "w", encoding="utf-8") as output_f:
            output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
        logger.info(f"Wrote {self._output_filename}")
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@dataclass
class GoogleDriveSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Objects of this class support fetching documents from Google Drive"""

    connector_config: SimpleGoogleDriveConfig

    def _list_objects(self, drive_id, recursive=False):
        """Collect metadata for the files located under a Drive folder.

        Args:
            drive_id: ID placed in the ``'<id>' in parents`` query of the
                top-level listing.
            recursive: when true, sub-folders are listed as well.

        Returns:
            A list of the metadata dicts returned by the Drive API (``id``,
            ``name``, ``mimeType``), each extended in place with
            ``download_dir``, ``download_filepath``, ``output_dir`` and
            ``output_filepath``.
        """
        files = []
        service = self.connector_config.create_session_handle().service

        def traverse(drive_id, download_dir, output_dir, recursive=False):
            """Append every accepted file under ``drive_id`` to ``files``, page by page."""
            page_token = None
            while True:
                response = (
                    service.files()
                    .list(
                        spaces="drive",
                        fields="nextPageToken, files(id, name, mimeType)",
                        pageToken=page_token,
                        corpora="user",
                        q=f"'{drive_id}' in parents",
                    )
                    .execute()
                )

                for meta in response.get("files", []):
                    if meta.get("mimeType") == "application/vnd.google-apps.folder":
                        # Folders never produce a file entry; they are only
                        # descended into when recursion was requested.
                        dir_ = DIRECTORY_FORMAT.format(name=meta.get("name"), id=meta.get("id"))
                        if recursive:
                            download_sub_dir = (download_dir / dir_).resolve()
                            output_sub_dir = (output_dir / dir_).resolve()
                            traverse(meta.get("id"), download_sub_dir, output_sub_dir, True)
                    else:
                        # An extension is guessed from the MIME type only when
                        # the file name does not already carry a suffix.
                        ext = ""
                        if not Path(meta.get("name")).suffixes:
                            guess = guess_extension(meta.get("mimeType"))
                            ext = guess if guess else ext

                        if meta.get("mimeType", "").startswith("application/vnd.google-apps"):
                            # Google-native documents need an export MIME type;
                            # those without a mapping are skipped.
                            export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(meta.get("mimeType"))
                            if not export_mime:
                                logger.info(
                                    f"File {meta.get('name')} has an "
                                    f"unsupported MimeType {meta.get('mimeType')}",
                                )
                                continue

                            if not ext:
                                guess = guess_extension(export_mime)
                                ext = guess if guess else ext

                        # TODO (Habeeb): Consider filtering at the query level.
                        # NOTE(review): a non-Google-native file whose name already
                        # has a suffix keeps ext == "" and is therefore skipped
                        # whenever an extension filter is configured - confirm
                        # that this is intended.
                        if (
                            self.connector_config.extension
                            and self.connector_config.extension != ext
                        ):  # noqa: SIM102
                            logger.debug(
                                f"File {meta.get('name')} does not match "
                                f"the file type {self.connector_config.extension}",
                            )
                            continue

                        name = FILE_FORMAT.format(name=meta.get("name"), id=meta.get("id"), ext=ext)
                        meta["download_dir"] = str(download_dir)
                        meta["download_filepath"] = (download_dir / name).resolve().as_posix()
                        meta["output_dir"] = str(output_dir)
                        meta["output_filepath"] = (output_dir / name).resolve().as_posix()
                        files.append(meta)

                # Keep requesting pages until the API stops returning a token.
                page_token = response.get("nextPageToken", None)
                if page_token is None:
                    break

        traverse(
            drive_id,
            Path(self.read_config.download_dir),
            Path(self.processor_config.output_dir),
            recursive,
        )
        return files

    def initialize(self):
        """No setup is performed for this connector."""
        pass

    def check_connection(self):
        """Validate the connection by creating a session handle.

        Raises:
            SourceConnectionError: if creating the session handle fails; the
                original exception is attached as the cause.
        """
        try:
            self.connector_config.create_session_handle().service
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain explicitly so the traceback marks the original failure as
            # the direct cause of the connection error.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_ingest_docs(self):
        """Return one ``GoogleDriveIngestDoc`` per file listed for the configured drive id."""
        files = self._list_objects(self.connector_config.drive_id, self.connector_config.recursive)
        return [
            GoogleDriveIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                meta=file,
            )
            for file in files
        ]
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from functools import reduce
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
8
|
+
from unstructured_ingest.error import SourceConnectionError
|
|
9
|
+
from unstructured_ingest.interfaces import (
|
|
10
|
+
AccessConfig,
|
|
11
|
+
BaseConnectorConfig,
|
|
12
|
+
BaseSessionHandle,
|
|
13
|
+
BaseSingleIngestDoc,
|
|
14
|
+
BaseSourceConnector,
|
|
15
|
+
ConfigSessionHandleMixin,
|
|
16
|
+
IngestDocCleanupMixin,
|
|
17
|
+
IngestDocSessionHandleMixin,
|
|
18
|
+
SourceConnectorCleanupMixin,
|
|
19
|
+
SourceMetadata,
|
|
20
|
+
)
|
|
21
|
+
from unstructured_ingest.logger import logger
|
|
22
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
23
|
+
|
|
24
|
+
if t.TYPE_CHECKING:
|
|
25
|
+
from hubspot import HubSpot
|
|
26
|
+
|
|
27
|
+
# Module-level constant; it is not referenced anywhere else in this module.
CONTENT_TAG = "content"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class HubSpotObjectTypes(Enum):
    """HubSpot object types this connector knows how to retrieve.

    Each value is the string used for ``object_type`` on ingest docs and as
    the attribute name when resolving the matching client API.
    """

    # Resolved under ``crm.objects.<value>`` on the HubSpot client.
    CALLS = "calls"
    COMMUNICATIONS = "communications"
    EMAILS = "emails"
    NOTES = "notes"

    # Resolved directly under ``crm.<value>`` on the HubSpot client.
    PRODUCTS = "products"
    TICKETS = "tickets"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class HubSpotSessionHandle(BaseSessionHandle):
    """Session handle that carries the HubSpot client instance."""

    # The client object; the annotation is a string because ``HubSpot`` is
    # only imported under ``t.TYPE_CHECKING``.
    service: "HubSpot"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class HubSpotAccessConfig(AccessConfig):
    """Credentials used to build the HubSpot client."""

    # Token passed to the client as ``access_token``; declared sensitive and
    # excluded from the dataclass repr.
    api_token: str = enhanced_field(repr=False, sensitive=True)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class SimpleHubSpotConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
    """Connector configuration for HubSpot and factory for its session handle."""

    access_config: HubSpotAccessConfig
    # ``params`` and ``properties`` are not read anywhere in this module.
    params: t.Optional[str] = None
    properties: t.Optional[dict] = None
    # When set, only these object types are retrieved by the source connector.
    object_types: t.Optional[t.List[str]] = None
    # Maps an object type to extra property names added to its content properties.
    custom_properties: t.Optional[t.Dict[str, t.List[str]]] = None

    @requires_dependencies(["hubspot"], extras="hubspot")
    def create_session_handle(self) -> HubSpotSessionHandle:
        """Build a HubSpot client from the configured token and wrap it in a session handle."""
        from hubspot import HubSpot

        return HubSpotSessionHandle(
            service=HubSpot(access_token=self.access_config.api_token),
        )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
class HubSpotIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single HubSpot object whose selected properties are saved as a text file."""

    connector_config: SimpleHubSpotConfig
    object_id: str
    object_type: str
    # Names of the object properties that make up the document text.
    content_properties: t.List[str]
    registry_name: str = "hubspot"

    def __post_init__(self):
        self._add_custom_properties()

    @property
    def filename(self):
        """Absolute path of the downloaded text file for this object."""
        return (
            Path(self.read_config.download_dir)
            / f"{self.object_type}/{self.object_id}.txt"  # type: ignore
        ).resolve()

    @property
    def _output_filename(self):
        """Absolute path of the structured JSON output for this object."""
        return (
            Path(self.processor_config.output_dir)
            / f"{self.object_type}/{self.object_id}.json"  # type: ignore
        ).resolve()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        return {
            f"{self.registry_name}_id": self.object_id,
        }

    @property
    def version(self) -> t.Optional[str]:
        return None

    @property
    def source_url(self) -> t.Optional[str]:
        return None

    def _add_custom_properties(self):
        """Extend ``content_properties`` with the custom properties configured for this type.

        A new list is assigned instead of extending in place: the list handed
        to the constructor may be shared by several docs, and an in-place
        ``+=`` would append the custom properties to it once per doc. Names
        already present are not added again, so the merge is idempotent.
        """
        custom_properties = self.connector_config.custom_properties
        if custom_properties is None:
            return
        cprops = custom_properties.get(self.object_type)
        if cprops is None:
            return
        merged = list(self.content_properties)
        for cprop in cprops:
            if cprop not in merged:
                merged.append(cprop)
        self.content_properties = merged

    def _join_object_properties(self, obj) -> str:
        """Join the non-``None`` content property values of ``obj`` with newlines."""
        return "\n".join(
            [
                obj.properties[cprop]
                for cprop in self.content_properties
                if (obj.properties.get(cprop) is not None)
            ],
        )

    def _resolve_getter(self):
        """Return the client's ``get_by_id`` method for this doc's object type."""
        method_path = ""
        if self.object_type in [
            HubSpotObjectTypes.CALLS.value,
            HubSpotObjectTypes.COMMUNICATIONS.value,
            HubSpotObjectTypes.EMAILS.value,
            HubSpotObjectTypes.NOTES.value,
        ]:
            method_path = f"crm.objects.{self.object_type}.basic_api.get_by_id"
        if self.object_type in [
            HubSpotObjectTypes.PRODUCTS.value,
            HubSpotObjectTypes.TICKETS.value,
        ]:
            method_path = f"crm.{self.object_type}.basic_api.get_by_id"

        # Walk the dotted path attribute by attribute, starting at the client.
        method = reduce(getattr, method_path.split("."), self.session_handle.service)
        return method

    @requires_dependencies(["hubspot"], extras="hubspot")
    def _fetch_obj(self, check_only=False):
        """Fetch this object from HubSpot.

        Args:
            check_only: when true, no content properties are requested.

        Returns:
            The client response, or ``None`` if the object was not found.
        """
        # NOTE(review): the exception class is imported from
        # hubspot.crm.objects only - confirm that the products and tickets
        # APIs raise this same class.
        from hubspot.crm.objects.exceptions import NotFoundException

        get_by_id_method = self._resolve_getter()
        try:
            response = get_by_id_method(
                self.object_id,
                properties=([] if check_only else self.content_properties),
            )
        except NotFoundException as e:
            logger.error(e)
            return None
        return response

    def update_source_metadata(self, **kwargs) -> None:
        """Set ``source_metadata`` from the ``object`` keyword or a fresh lookup.

        The object is only fetched when the caller did not supply one, so an
        already-fetched object does not trigger a second API request.
        """
        if "object" in kwargs:
            obj = kwargs["object"]
        else:
            obj = self._fetch_obj(check_only=True)  # type: ignore
        if obj is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        self.source_metadata = SourceMetadata(
            date_created=obj.created_at.isoformat(),
            date_modified=obj.updated_at.isoformat(),
            exists=True,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetch the object and write its joined content properties to ``filename``.

        Raises:
            ValueError: if the object could not be retrieved.
        """
        obj = self._fetch_obj()
        if obj is None:
            # One formatted message rather than two positional arguments,
            # which would make the exception text a tuple.
            raise ValueError(
                f"Failed to retrieve object {self.registry_name} "
                f"with ID {self.object_id}",
            )
        self.update_source_metadata(object=obj)
        output = self._join_object_properties(obj)
        self.filename.parent.mkdir(parents=True, exist_ok=True)
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(output)
        return
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@dataclass
class HubSpotSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that builds ingest docs from HubSpot objects."""

    connector_config: SimpleHubSpotConfig

    def initialize(self):
        """Create the HubSpot client used by the ``_get_*`` methods."""
        self.hubspot = self.connector_config.create_session_handle().service

    def check_connection(self):
        return self.connector_config.create_session_handle().service

    @requires_dependencies(["hubspot"], extras="hubspot")
    def _list_objects(self, get_page_method, object_type: str, content_properties: t.List[str]):
        """Build one ingest doc per object returned by ``get_page_method``.

        Args:
            get_page_method: zero-argument callable whose result exposes ``results``.
            object_type: value stored as ``object_type`` on every doc.
            content_properties: property names that make up the document text.

        Returns:
            A list of ``HubSpotIngestDoc``; empty if the retrieval raised.
        """
        # NOTE(review): the page method is called once with no arguments;
        # whether that covers every object or only a first page depends on
        # the HubSpot client - TODO confirm.
        try:
            objects = get_page_method()
        except Exception as e:
            logger.error(e)
            logger.error(
                f"Failed to retrieve {object_type}, omitting processing...",
            )
            return []
        return [
            HubSpotIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                object_id=obj.id,
                object_type=object_type,
                # Each doc gets its own list so that extending one doc's
                # properties cannot leak into the others.
                content_properties=list(content_properties),
            )
            for obj in objects.results
        ]

    def _get_calls(self) -> t.List[HubSpotIngestDoc]:
        return self._list_objects(
            self.hubspot.crm.objects.calls.basic_api.get_page,
            HubSpotObjectTypes.CALLS.value,
            ["hs_call_title", "hs_call_body"],
        )

    def _get_communications(self) -> t.List[HubSpotIngestDoc]:
        return self._list_objects(
            self.hubspot.crm.objects.communications.basic_api.get_page,
            HubSpotObjectTypes.COMMUNICATIONS.value,
            ["hs_communication_body"],
        )

    def _get_emails(self) -> t.List[HubSpotIngestDoc]:
        return self._list_objects(
            self.hubspot.crm.objects.emails.basic_api.get_page,
            HubSpotObjectTypes.EMAILS.value,
            ["hs_email_subject", "hs_email_text"],
        )

    def _get_notes(self) -> t.List[HubSpotIngestDoc]:
        return self._list_objects(
            self.hubspot.crm.objects.notes.basic_api.get_page,
            HubSpotObjectTypes.NOTES.value,
            ["hs_note_body"],
        )

    def _get_products(self) -> t.List[HubSpotIngestDoc]:
        return self._list_objects(
            self.hubspot.crm.products.basic_api.get_page,
            HubSpotObjectTypes.PRODUCTS.value,
            ["description"],
        )

    def _get_tickets(self) -> t.List[HubSpotIngestDoc]:
        return self._list_objects(
            self.hubspot.crm.tickets.basic_api.get_page,
            HubSpotObjectTypes.TICKETS.value,
            ["subject", "content"],
        )

    def get_ingest_docs(self):
        """Return ingest docs for every configured (or, by default, every known) object type.

        Raises:
            ValueError: if ``connector_config.object_types`` names a type
                that has no retrieval method.
        """
        obj_method_resolver = {
            HubSpotObjectTypes.CALLS.value: self._get_calls,
            HubSpotObjectTypes.COMMUNICATIONS.value: self._get_communications,
            HubSpotObjectTypes.EMAILS.value: self._get_emails,
            HubSpotObjectTypes.NOTES.value: self._get_notes,
            HubSpotObjectTypes.PRODUCTS.value: self._get_products,
            HubSpotObjectTypes.TICKETS.value: self._get_tickets,
        }

        requested = self.connector_config.object_types
        if requested is not None:
            # Reject unknown names up front; otherwise the lookup would yield
            # None and fail later with "NoneType is not callable".
            unsupported = [name for name in requested if name not in obj_method_resolver]
            if unsupported:
                raise ValueError(
                    f"Unsupported HubSpot object types: {unsupported}. "
                    f"Supported types are: {list(obj_method_resolver)}",
                )
            obj_method_resolver = {
                obj_name: obj_method_resolver[obj_name] for obj_name in requested
            }

        ingest_docs: t.List[HubSpotIngestDoc] = []
        for obj_name, obj_method in obj_method_resolver.items():
            logger.info(f"Retrieving - {obj_name}")
            ingest_docs += obj_method()

        return ingest_docs
|