unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from uuid import UUID
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
9
|
+
from unstructured_ingest.error import SourceConnectionError
|
|
10
|
+
from unstructured_ingest.interfaces import (
|
|
11
|
+
AccessConfig,
|
|
12
|
+
BaseConnectorConfig,
|
|
13
|
+
BaseSingleIngestDoc,
|
|
14
|
+
BaseSourceConnector,
|
|
15
|
+
IngestDocCleanupMixin,
|
|
16
|
+
RetryStrategyConfig,
|
|
17
|
+
SourceConnectorCleanupMixin,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.logger import logger
|
|
20
|
+
from unstructured_ingest.utils.dep_check import (
|
|
21
|
+
requires_dependencies,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
NOTION_API_VERSION = "2022-06-28"
|
|
25
|
+
if t.TYPE_CHECKING:
|
|
26
|
+
from unstructured_ingest.connector.notion.client import Client as NotionClient
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class NotionAccessConfig(AccessConfig):
    """Credentials used to authenticate against the Notion API."""

    # API key handed to the Notion client as its ``auth`` value; declared
    # through ``enhanced_field`` with ``sensitive=True``.
    notion_api_key: str = enhanced_field(sensitive=True)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class SimpleNotionConfig(BaseConnectorConfig):
    """Connector config selecting which Notion pages and databases to ingest.

    Ids supplied for pages and databases are normalized to canonical UUID
    strings when the config is created (see ``__post_init__``).
    """

    # Credentials for the Notion API.
    access_config: NotionAccessConfig
    # Ids of the Notion pages to ingest, if any.
    page_ids: t.Optional[t.List[str]] = None
    # Ids of the Notion databases to ingest, if any.
    database_ids: t.Optional[t.List[str]] = None
    # NOTE(review): presumably controls whether child pages/databases are
    # followed; the code consuming this flag is not visible here -- confirm.
    recursive: bool = False

    def __post_init__(self):
        """Normalize the configured ids to canonical hyphenated UUID strings.

        Surrounding whitespace is stripped from every id before parsing.

        Raises:
            ValueError: if an id is not a valid UUID (raised by ``uuid.UUID``).
        """
        if self.page_ids:
            self.page_ids = [str(UUID(p.strip())) for p in self.page_ids]

        if self.database_ids:
            self.database_ids = [str(UUID(d.strip())) for d in self.database_ids]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class NotionPageIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating fetching a Notion page and writing processed results
    (but not doing the processing!).

    Also includes a cleanup method. When things go wrong and the cleanup
    method is not called, the file is left behind on the filesystem to assist debugging.
    """

    page_id: str
    connector_config: SimpleNotionConfig
    registry_name: str = "notion_page"
    retry_strategy_config: t.Optional[RetryStrategyConfig] = None

    def _tmp_download_file(self):
        """Return the path of the local HTML file for this page."""
        page_file = self.page_id + ".html"
        return Path(self.read_config.download_dir) / page_file

    @property
    def _output_filename(self):
        """Return the path of the JSON output file for this page."""
        page_file = self.page_id + ".json"
        return Path(self.processor_config.output_dir) / page_file

    def _create_full_tmp_dir_path(self):
        """Create the download directory (and any missing parents)."""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    @requires_dependencies(dependencies=["notion_client"], extras="notion")
    def get_client(self):
        """Build a new Notion client from this doc's access and retry config."""
        from unstructured_ingest.connector.notion.client import Client as NotionClient

        # Pin the version of the api to avoid schema changes
        return NotionClient(
            notion_version=NOTION_API_VERSION,
            auth=self.connector_config.access_config.notion_api_key,
            logger=logger,
            log_level=logger.level,
            retry_strategy_config=self.retry_strategy_config,
        )

    @BaseSingleIngestDoc.skip_if_file_exists
    @requires_dependencies(dependencies=["notion_client"], extras="notion")
    def get_file(self):
        """Extract the page as HTML and write it to the download path.

        On success ``check_exists`` and ``file_exists`` are set to True; the
        file is only written when the extraction produced HTML. An "object not
        found" API error marks the page as missing (``file_exists = False``);
        any other ``APIResponseError`` is logged and not re-raised.
        """
        from notion_client import APIErrorCode, APIResponseError

        from unstructured_ingest.connector.notion.helpers import extract_page_html

        self._create_full_tmp_dir_path()

        client = self.get_client()

        try:
            text_extraction = extract_page_html(
                client=client,
                page_id=self.page_id,
                logger=logger,
            )
            self.check_exists = True
            self.file_exists = True
            if html := text_extraction.html:
                # Explicit UTF-8: page content may contain characters the
                # platform's default locale encoding cannot represent.
                with open(self._tmp_download_file(), "w", encoding="utf-8") as page_file:
                    page_file.write(html.render(pretty=True))

        except APIResponseError as error:
            if error.code == APIErrorCode.ObjectNotFound:
                self.check_exists = True
                self.file_exists = False
            else:
                logger.error(f"Error: {error}")

    @requires_dependencies(dependencies=["notion_client"], extras="notion")
    def get_file_metadata(self):
        """Retrieve the page object and store it on ``self.file_metadata``.

        On success ``check_exists`` and ``file_exists`` are set to True. An
        "object not found" API error marks the page as missing; any other
        ``APIResponseError`` is logged and not re-raised, in which case
        ``file_metadata`` is left untouched.
        """
        from notion_client import APIErrorCode, APIResponseError

        client = self.get_client()

        # Metadata about the page is read from the Notion pages endpoint.
        try:
            self.file_metadata = client.pages.retrieve(page_id=self.page_id)  # type: ignore
            self.check_exists = True
            self.file_exists = True
        except APIResponseError as error:
            if error.code == APIErrorCode.ObjectNotFound:
                self.check_exists = True
                self.file_exists = False
            else:
                logger.error(f"Error: {error}")

    @property
    def date_created(self) -> t.Optional[str]:
        """The date the document was created on the source system.

        Returns None when the page metadata could not be retrieved.
        """
        if not getattr(self, "file_metadata", None):
            self.get_file_metadata()

        # The lookup above may fail without ever setting the attribute, so
        # read it defensively instead of raising AttributeError.
        metadata = getattr(self, "file_metadata", None)
        return metadata.created_time if metadata else None

    @property
    def date_modified(self) -> t.Optional[str]:
        """The date the document was last modified on the source system.

        Returns None when the page metadata could not be retrieved.
        """
        if not getattr(self, "file_metadata", None):
            self.get_file_metadata()

        # See date_created: the attribute may still be missing after a failure.
        metadata = getattr(self, "file_metadata", None)
        return metadata.last_edited_time if metadata else None

    @property
    def exists(self) -> t.Optional[bool]:
        """Whether the document exists on the remote source.

        Returns None when existence has not been determined, e.g. the metadata
        lookup failed with an error other than "object not found".
        """
        if not getattr(self, "check_exists", False):
            self.get_file_metadata()

        return getattr(self, "file_exists", None)

    @property
    def filename(self):
        """The filename of the file created from a notion page"""
        return self._tmp_download_file()
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@dataclass
|
|
172
|
+
class NotionDatabaseIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
|
|
173
|
+
"""Class encapsulating fetching a doc and writing processed results (but not
|
|
174
|
+
doing the processing!).
|
|
175
|
+
|
|
176
|
+
Also includes a cleanup method. When things go wrong and the cleanup
|
|
177
|
+
method is not called, the file is left behind on the filesystem to assist debugging.
|
|
178
|
+
"""
|
|
179
|
+
|
|
180
|
+
database_id: str
|
|
181
|
+
connector_config: SimpleNotionConfig
|
|
182
|
+
retry_strategy_config: t.Optional[RetryStrategyConfig] = None
|
|
183
|
+
registry_name: str = "notion_database"
|
|
184
|
+
|
|
185
|
+
def _tmp_download_file(self):
|
|
186
|
+
page_file = self.database_id + ".html"
|
|
187
|
+
return Path(self.read_config.download_dir) / page_file
|
|
188
|
+
|
|
189
|
+
@property
|
|
190
|
+
def _output_filename(self):
|
|
191
|
+
page_file = self.database_id + ".json"
|
|
192
|
+
return Path(self.processor_config.output_dir) / page_file
|
|
193
|
+
|
|
194
|
+
def _create_full_tmp_dir_path(self):
|
|
195
|
+
self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)
|
|
196
|
+
|
|
197
|
+
@requires_dependencies(dependencies=["notion_client"], extras="notion")
|
|
198
|
+
def get_client(self):
|
|
199
|
+
from unstructured_ingest.connector.notion.client import Client as NotionClient
|
|
200
|
+
|
|
201
|
+
# Pin the version of the api to avoid schema changes
|
|
202
|
+
return NotionClient(
|
|
203
|
+
notion_version=NOTION_API_VERSION,
|
|
204
|
+
auth=self.connector_config.access_config.notion_api_key,
|
|
205
|
+
logger=logger,
|
|
206
|
+
log_level=logger.level,
|
|
207
|
+
retry_strategy_config=self.retry_strategy_config,
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
@BaseSingleIngestDoc.skip_if_file_exists
|
|
211
|
+
@requires_dependencies(dependencies=["notion_client"], extras="notion")
|
|
212
|
+
def get_file(self):
|
|
213
|
+
from notion_client import APIErrorCode, APIResponseError
|
|
214
|
+
|
|
215
|
+
from unstructured_ingest.connector.notion.helpers import extract_database_html
|
|
216
|
+
|
|
217
|
+
self._create_full_tmp_dir_path()
|
|
218
|
+
|
|
219
|
+
client = self.get_client()
|
|
220
|
+
|
|
221
|
+
try:
|
|
222
|
+
text_extraction = extract_database_html(
|
|
223
|
+
client=client,
|
|
224
|
+
database_id=self.database_id,
|
|
225
|
+
logger=logger,
|
|
226
|
+
)
|
|
227
|
+
self.check_exists = True
|
|
228
|
+
self.file_exists = True
|
|
229
|
+
if html := text_extraction.html:
|
|
230
|
+
with open(self._tmp_download_file(), "w") as page_file:
|
|
231
|
+
page_file.write(html.render(pretty=True))
|
|
232
|
+
|
|
233
|
+
except APIResponseError as error:
|
|
234
|
+
if error.code == APIErrorCode.ObjectNotFound:
|
|
235
|
+
self.check_exists = True
|
|
236
|
+
self.file_exists = False
|
|
237
|
+
else:
|
|
238
|
+
logger.error(f"Error: {error}")
|
|
239
|
+
|
|
240
|
+
@requires_dependencies(dependencies=["notion_client"], extras="notion")
|
|
241
|
+
def get_file_metadata(self):
|
|
242
|
+
from notion_client import APIErrorCode, APIResponseError
|
|
243
|
+
|
|
244
|
+
client = self.get_client()
|
|
245
|
+
|
|
246
|
+
# The Notion block endpoint gives more hierarchical information (parent,child relationships)
|
|
247
|
+
# than the pages endpoint so choosing to use that one to get metadata about the page
|
|
248
|
+
try:
|
|
249
|
+
self.file_metadata = client.databases.retrieve(
|
|
250
|
+
database_id=self.database_id,
|
|
251
|
+
) # type: ignore
|
|
252
|
+
self.check_exists = True
|
|
253
|
+
self.file_exists = True
|
|
254
|
+
except APIResponseError as error:
|
|
255
|
+
if error.code == APIErrorCode.ObjectNotFound:
|
|
256
|
+
self.check_exists = True
|
|
257
|
+
self.file_exists = False
|
|
258
|
+
else:
|
|
259
|
+
logger.error(f"Error: {error}")
|
|
260
|
+
|
|
261
|
+
@property
|
|
262
|
+
def date_created(self) -> t.Optional[str]:
|
|
263
|
+
"""The date the document was created on the source system."""
|
|
264
|
+
if not hasattr(self, "file_metadata") or not self.file_metadata:
|
|
265
|
+
self.get_file_metadata()
|
|
266
|
+
|
|
267
|
+
return self.file_metadata.created_time if self.file_metadata else None
|
|
268
|
+
|
|
269
|
+
@property
def date_modified(self) -> t.Optional[str]:
    """The date the document was last modified on the source system.

    Metadata is fetched lazily through ``get_file_metadata()`` when it has
    not been loaded yet.

    Returns:
        The ``last_edited_time`` of the loaded metadata, or None when no
        metadata is available.
    """
    if not getattr(self, "file_metadata", None):
        self.get_file_metadata()

    # The fetch may complete without ever assigning ``file_metadata`` (for
    # instance when the lookup fails), so read the attribute defensively
    # instead of raising AttributeError.
    metadata = getattr(self, "file_metadata", None)
    return metadata.last_edited_time if metadata else None
|
|
276
|
+
|
|
277
|
+
@property
def exists(self) -> t.Optional[bool]:
    """Whether the document exists on the remote source.

    If existence has not been checked yet, the metadata is fetched first so
    that ``file_exists`` gets a chance to be populated.
    """
    already_checked = self.check_exists
    if not already_checked:
        self.get_file_metadata()
    return self.file_exists
|
|
286
|
+
|
|
287
|
+
@property
def filename(self):
    """Path of the local file this document is written to.

    This is whatever ``_tmp_download_file()`` returns.
    """
    download_path = self._tmp_download_file()
    return download_path
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
@dataclass
|
|
294
|
+
class NotionSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
|
295
|
+
"""Objects of this class support fetching document(s) from"""
|
|
296
|
+
|
|
297
|
+
connector_config: SimpleNotionConfig
|
|
298
|
+
retry_strategy_config: t.Optional[RetryStrategyConfig] = None
|
|
299
|
+
_client: t.Optional["NotionClient"] = field(init=False, default=None)
|
|
300
|
+
|
|
301
|
+
@property
def client(self) -> "NotionClient":
    """The Notion client, created on first access and then reused."""
    if self._client is not None:
        return self._client
    # Nothing cached yet: build the client once and keep it.
    self._client = self.create_client()
    return self._client
|
|
306
|
+
|
|
307
|
+
@requires_dependencies(dependencies=["notion_client"], extras="notion")
def create_client(self) -> "NotionClient":
    """Build a new Notion client from this connector's configuration.

    The client is given the pinned API version, the API key from the
    connector's access config, the module logger and its level, and the
    connector's retry strategy config.
    """
    from unstructured_ingest.connector.notion.client import Client as NotionClient

    client_kwargs = {
        "notion_version": NOTION_API_VERSION,
        "auth": self.connector_config.access_config.notion_api_key,
        "logger": logger,
        "log_level": logger.level,
        "retry_strategy_config": self.retry_strategy_config,
    }
    return NotionClient(**client_kwargs)
|
|
318
|
+
|
|
319
|
+
def check_connection(self):
    """Validate that the Notion API can be reached with this connector's client.

    Builds a ``HEAD`` request for ``users``, sends it through the client's
    underlying HTTP client and checks the response status.

    Raises:
        SourceConnectionError: if the request could not be sent or the
            response carried an error status.
    """
    try:
        request = self.client._build_request("HEAD", "users")
        response = self.client.client.send(request)
        response.raise_for_status()
    except httpx.HTTPError as http_error:
        # httpx.HTTPError is the common base of HTTPStatusError (bad status)
        # and RequestError (connect failures, timeouts, ...), so transport
        # problems are reported as connection errors too.
        logger.error(f"failed to validate connection: {http_error}", exc_info=True)
        raise SourceConnectionError(
            f"failed to validate connection: {http_error}"
        ) from http_error
|
|
327
|
+
|
|
328
|
+
@requires_dependencies(dependencies=["notion_client"], extras="notion")
def initialize(self):
    """Prepare the connector by accessing its ``client`` property.

    The returned client is not used here; the property is read only for
    its effect.
    """
    # Reading the property is the whole point; the value is discarded.
    _ = self.client
|
|
332
|
+
|
|
333
|
+
@requires_dependencies(dependencies=["notion_client"], extras="notion")
def get_child_page_content(self, page_id: str):
    """Collect the content nested under a page, after checking the page is reachable.

    Args:
        page_id: id of the Notion page to start from.

    Returns:
        The result of ``get_recursive_content_from_page`` for that page.

    Raises:
        ValueError: if the status lookup for the page does not return 200.
    """
    from unstructured_ingest.connector.notion.helpers import (
        get_recursive_content_from_page,
    )

    # sanity check that page id is valid
    status = self.client.pages.retrieve_status(page_id=page_id)
    if status != 200:
        raise ValueError(
            f"page associated with page id could not be found: {page_id}",
        )

    return get_recursive_content_from_page(
        client=self.client,
        page_id=page_id,
        logger=logger,
    )
|
|
352
|
+
|
|
353
|
+
def get_child_content(self, page_id: str):
    """Collect the content nested under a page without any prior status check.

    Args:
        page_id: id of the Notion page to start from.

    Returns:
        The result of ``get_recursive_content_from_page`` for that page.
    """
    from unstructured_ingest.connector.notion.helpers import (
        get_recursive_content_from_page,
    )

    return get_recursive_content_from_page(
        client=self.client,
        page_id=page_id,
        logger=logger,
    )
|
|
364
|
+
|
|
365
|
+
@requires_dependencies(dependencies=["notion_client"], extras="notion")
def get_child_database_content(self, database_id: str):
    """Collect the content nested under a database, after checking it is reachable.

    Args:
        database_id: id of the Notion database to start from.

    Returns:
        The result of ``get_recursive_content_from_database`` for that database.

    Raises:
        ValueError: if the status lookup for the database does not return 200.
    """
    from unstructured_ingest.connector.notion.helpers import (
        get_recursive_content_from_database,
    )

    # sanity check that database id is valid
    status = self.client.databases.retrieve_status(database_id=database_id)
    if status != 200:
        raise ValueError(
            f"database associated with database id could not be found: {database_id}",
        )

    return get_recursive_content_from_database(
        client=self.client,
        database_id=database_id,
        logger=logger,
    )
|
|
384
|
+
|
|
385
|
+
def get_ingest_docs(self):
    """Build one ingest doc per configured page and database.

    When the connector config is recursive, the pages and databases found
    beneath the configured ones are added as well. Child ids are
    de-duplicated in first-seen order, and ids that were configured
    explicitly are not added a second time.

    Returns:
        A list with the docs for the configured pages, then the configured
        databases, then (recursive only) the child pages and child databases.
    """
    config = self.connector_config

    def page_docs(page_ids):
        # One page doc per id, all sharing this connector's configs.
        return [
            NotionPageIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                retry_strategy_config=self.retry_strategy_config,
                read_config=self.read_config,
                page_id=page_id,
            )
            for page_id in page_ids
        ]

    def database_docs(database_ids):
        # One database doc per id, all sharing this connector's configs.
        return [
            NotionDatabaseIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                retry_strategy_config=self.retry_strategy_config,
                read_config=self.read_config,
                database_id=database_id,
            )
            for database_id in database_ids
        ]

    # Either id collection may be unset; treat that the same as empty.
    root_page_ids = config.page_ids or []
    root_database_ids = config.database_ids or []

    docs: t.List[BaseSingleIngestDoc] = []
    docs += page_docs(root_page_ids)
    docs += database_docs(root_database_ids)
    if not config.recursive:
        return docs

    logger.info("Getting recursive content")
    child_pages = []
    child_databases = []
    for page_id in root_page_ids:
        child_content = self.get_child_page_content(page_id=page_id)
        child_pages.extend(child_content.child_pages)
        child_databases.extend(child_content.child_databases)

    for database_id in root_database_ids:
        child_content = self.get_child_database_content(database_id=database_id)
        child_pages.extend(child_content.child_pages)
        child_databases.extend(child_content.child_databases)

    # Remove duplicates and anything already configured explicitly.
    # dict.fromkeys keeps first-seen order, so the result is stable between
    # runs (a plain set() is not), and the sets make the filter linear.
    known_pages = set(root_page_ids)
    child_pages = [c for c in dict.fromkeys(child_pages) if c not in known_pages]
    known_databases = set(root_database_ids)
    child_databases = [
        db for db in dict.fromkeys(child_databases) if db not in known_databases
    ]

    if child_pages:
        logger.info(
            "Adding the following child page ids: {}".format(", ".join(child_pages)),
        )
        docs += page_docs(child_pages)

    if child_databases:
        logger.info(
            "Adding the following child database ids: {}".format(
                ", ".join(child_databases),
            ),
        )
        docs += database_docs(child_databases)

    return docs
|