unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import hashlib
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, Optional, TypedDict
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.v2.interfaces import FileData
|
|
9
|
+
from unstructured_ingest.v2.logger import logger
|
|
10
|
+
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
11
|
+
from unstructured_ingest.v2.pipeline.utils import sterilize_dict
|
|
12
|
+
from unstructured_ingest.v2.processes.partitioner import Partitioner
|
|
13
|
+
|
|
14
|
+
STEP_ID = "partition"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PartitionStepResponse(TypedDict):
    """Mapping returned by the partition step for a single processed file."""

    # Path of the serialized FileData record the step was invoked with.
    file_data_path: str
    # Path of the JSON file that holds the partitioned output.
    path: str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class PartitionStep(PipelineStep):
    """Pipeline step that partitions a file and stores the result as JSON.

    Output files live in the step's cache directory and are named after a
    hash of the partitioner config plus the FileData filename, so an existing
    output is reused unless reprocessing is requested.
    """

    process: Partitioner
    identifier: str = STEP_ID

    def __str__(self):
        strategy = self.process.config.strategy
        return f"{self.identifier} ({strategy})"

    def __post_init__(self):
        # Log a redacted, JSON-safe copy of the config.
        redacted = self.process.config.to_dict(redact_sensitive=True)
        config = sterilize_dict(redacted)
        logger.info(f"Created {self.identifier} with configs: {config}")

    def should_partition(self, filepath: Path, file_data: FileData) -> bool:
        """Return True when partitioning must run for this file.

        Partitioning runs when reprocessing is requested (on the step context
        or on the file itself) or when ``filepath`` does not exist yet.
        """
        forced = self.context.reprocess or file_data.reprocess
        return True if forced else not filepath.exists()

    def get_output_filepath(self, filename: Path) -> Path:
        """Return the resolved cache path for ``filename``'s partitioned output.

        The parent directory is created if it does not exist.
        """
        digest = self.get_hash(extras=[filename.name])
        target = (self.cache_dir / f"{digest}.json").resolve()
        target.parent.mkdir(parents=True, exist_ok=True)
        return target

    def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
        """Write ``partitioned_content`` to ``output_filepath`` as indented JSON."""
        with open(str(output_filepath), "w") as out_file:
            logger.debug(f"Writing partitioned output to: {output_filepath}")
            json.dump(partitioned_content, out_file, indent=2)

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str
    ) -> Optional[PartitionStepResponse]:
        """Partition the file at ``path`` unless a cached output can be reused.

        Args:
            fn: Sync or async callable invoked with ``filename`` and ``metadata``.
            path: Location of the file to partition.
            file_data_path: Location of the serialized FileData record.

        Returns:
            The FileData path paired with the path of the partitioned JSON.
        """
        path = Path(path)
        file_data = FileData.from_file(path=file_data_path)
        output_filepath = self.get_output_filepath(filename=Path(file_data_path))
        response = PartitionStepResponse(
            file_data_path=file_data_path, path=str(output_filepath)
        )
        if not self.should_partition(filepath=output_filepath, file_data=file_data):
            logger.debug(f"Skipping partitioning, output already exists: {output_filepath}")
            return response

        fn_kwargs = {"filename": path, "metadata": file_data.metadata}
        if asyncio.iscoroutinefunction(fn):
            semaphore = self.context.semaphore
            if semaphore:
                # Bound the number of concurrent coroutine calls.
                async with semaphore:
                    partitioned_content = await fn(**fn_kwargs)
            else:
                partitioned_content = await fn(**fn_kwargs)
        else:
            partitioned_content = fn(**fn_kwargs)

        self._save_output(
            output_filepath=str(output_filepath), partitioned_content=partitioned_content
        )
        return response

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Return a 12-character SHA-256 prefix of the config JSON plus ``extras``."""
        parts = [
            json.dumps(self.process.config.to_dict(), sort_keys=True, ensure_ascii=True)
        ]
        if extras:
            parts.extend(extras)
        return hashlib.sha256("".join(parts).encode()).hexdigest()[:12]
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import hashlib
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, Optional, TypedDict
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
9
|
+
from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
|
|
10
|
+
from unstructured_ingest.v2.logger import logger
|
|
11
|
+
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
12
|
+
from unstructured_ingest.v2.pipeline.utils import sterilize_dict
|
|
13
|
+
|
|
14
|
+
STEP_ID = "upload_stage"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class UploadStageStepResponse(TypedDict):
    """Mapping returned by the upload-stage step for a single staged file."""

    # Path of the serialized FileData record the step was invoked with.
    file_data_path: str
    # Path returned by the staging callable, converted to a string.
    path: str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class UploadStageStep(PipelineStep):
    """Pipeline step that hands an elements file to an ``UploadStager``.

    The staging callable receives the elements file, the matching FileData
    record, the step's cache directory and a hash-derived output filename;
    the path it returns is forwarded in the step response.
    """

    process: UploadStager
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.__class__.__name__})"

    def __post_init__(self):
        # The stager config is optional, so only serialize it for logging when set.
        config = (
            sterilize_dict(self.process.upload_stager_config.to_dict(redact_sensitive=True))
            if self.process.upload_stager_config
            else None
        )
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Created {self.identifier} with configs: {config}")

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str
    ) -> UploadStageStepResponse:
        """Stage the elements file at ``path`` for upload.

        Args:
            fn: Sync or async callable invoked with ``elements_filepath``,
                ``file_data``, ``output_dir`` and ``output_filename``.
            path: Location of the elements file to stage.
            file_data_path: Location of the serialized FileData record.

        Returns:
            The FileData path paired with the staged output path.
        """
        path = Path(path)
        fn_kwargs = {
            "elements_filepath": path,
            "file_data": FileData.from_file(path=file_data_path),
            "output_dir": self.cache_dir,
            "output_filename": self.get_hash(extras=[path.name]),
        }
        if not asyncio.iscoroutinefunction(fn):
            staged_output_path = fn(**fn_kwargs)
        elif semaphore := self.context.semaphore:
            # Bound the number of concurrent coroutine calls.
            async with semaphore:
                staged_output_path = await fn(**fn_kwargs)
        else:
            staged_output_path = await fn(**fn_kwargs)
        return UploadStageStepResponse(file_data_path=file_data_path, path=str(staged_output_path))

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Return a 12-character SHA-256 prefix of the stager config plus ``extras``.

        ``__post_init__`` accepts a stager without a config, so a missing
        config is hashed as an empty dict instead of raising AttributeError.
        """
        stager_config = self.process.upload_stager_config
        config_dict = stager_config.to_dict() if stager_config is not None else {}
        hashable_string = json.dumps(config_dict, sort_keys=True, ensure_ascii=True)
        if extras:
            hashable_string += "".join(extras)
        return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Callable, TypedDict
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
6
|
+
from unstructured_ingest.v2.logger import logger
|
|
7
|
+
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
8
|
+
from unstructured_ingest.v2.pipeline.utils import sterilize_dict
|
|
9
|
+
from unstructured_ingest.v2.processes.uncompress import Uncompressor
|
|
10
|
+
|
|
11
|
+
STEP_ID = "uncompress"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class UncompressStepResponse(TypedDict):
    """Mapping emitted by the uncompress step for each resulting file."""

    # Path of the JSON file the new FileData record was written to.
    file_data_path: str
    # Full path taken from the new record's source identifiers.
    path: str
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# NOTE(review): the sibling steps (PartitionStep, UploadStageStep, UploadStep)
# are decorated with @dataclass and this class is not — confirm that is intended.
class UncompressStep(PipelineStep):
    """Pipeline step that runs an ``Uncompressor`` over a FileData record.

    Every FileData record produced by the process is written next to the
    original record and reported back as one response entry.
    """

    process: Uncompressor
    identifier: str = STEP_ID

    def __post_init__(self):
        # The process config is optional, so only serialize it for logging when set.
        config = (
            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
            if self.process.config
            else None
        )
        logger.info(f"Created {self.identifier} with configs: {config}")

    def _collect_responses(
        self, new_file_data, file_data_path: str
    ) -> list[UncompressStepResponse]:
        """Persist each new FileData record and build its response entry.

        Records are written as ``<identifier>.json`` in the directory that
        contains ``file_data_path``.
        """
        output_dir = Path(file_data_path).parent
        responses = []
        for new_file in new_file_data:
            new_file_data_path = output_dir / f"{new_file.identifier}.json"
            new_file.to_file(path=str(new_file_data_path.resolve()))
            responses.append(
                UncompressStepResponse(
                    path=new_file.source_identifiers.fullpath,
                    file_data_path=str(new_file_data_path),
                )
            )
        return responses

    def _run(self, path: str, file_data_path: str) -> list[UncompressStepResponse]:
        """Run the process synchronously on the record at ``file_data_path``.

        ``path`` is accepted for signature compatibility and is not used.
        """
        file_data = FileData.from_file(path=file_data_path)
        new_file_data = self.process.run(file_data=file_data)
        return self._collect_responses(new_file_data, file_data_path)

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str
    ) -> list[UncompressStepResponse]:
        """Run ``fn`` (sync or async) on the record at ``file_data_path``.

        ``path`` is accepted for signature compatibility and is not used.
        """
        file_data = FileData.from_file(path=file_data_path)
        fn_kwargs = {"file_data": file_data}
        if not asyncio.iscoroutinefunction(fn):
            new_file_data = fn(**fn_kwargs)
        elif semaphore := self.context.semaphore:
            # Bound the number of concurrent coroutine calls.
            async with semaphore:
                new_file_data = await fn(**fn_kwargs)
        else:
            new_file_data = await fn(**fn_kwargs)
        return self._collect_responses(new_file_data, file_data_path)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Callable, Optional, TypedDict
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.v2.interfaces import FileData
|
|
7
|
+
from unstructured_ingest.v2.interfaces.uploader import UploadContent, Uploader
|
|
8
|
+
from unstructured_ingest.v2.logger import logger
|
|
9
|
+
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep, iterable_input, timed
|
|
10
|
+
from unstructured_ingest.v2.pipeline.utils import sterilize_dict
|
|
11
|
+
|
|
12
|
+
STEP_ID = "upload"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class UploadStepContent(TypedDict):
    """Mapping consumed by the upload step for a single file to upload."""

    # Path of the file to upload; the step converts it to a Path.
    path: str
    # Path of the serialized FileData record loaded for the upload.
    file_data_path: str
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class UploadStep(PipelineStep):
    """Pipeline step that passes staged content to an ``Uploader``.

    Async uploaders are driven per file through ``process_async``; all others
    receive the whole batch in a single ``run`` call.
    """

    process: Uploader
    identifier: str = STEP_ID

    def __str__(self):
        process_name = self.process.__class__.__name__
        return f"{self.identifier} ({process_name})"

    def __post_init__(self):
        # Both configs are optional; log a redacted, JSON-safe copy of each one set.
        upload_config = self.process.upload_config
        config = None
        if upload_config:
            config = sterilize_dict(upload_config.to_dict(redact_sensitive=True))

        raw_connection_config = self.process.connection_config
        connection_config = None
        if raw_connection_config:
            connection_config = sterilize_dict(
                raw_connection_config.to_dict(redact_sensitive=True)
            )

        logger.info(
            f"Created {self.identifier} with configs: {config}, "
            f"connection configs: {connection_config}"
        )

    def process_whole(self, iterable: iterable_input):
        """Upload the entire batch with one ``run`` call."""
        self.run(contents=iterable)

    @timed
    def __call__(self, iterable: iterable_input):
        """Log the batch size, then dispatch to the async or whole-batch path."""
        step_name = self.__class__.__name__
        doc_count = len(iterable)  # type: ignore
        logger.info(f"Calling {step_name} with {doc_count} docs")
        if self.process.is_async():
            self.process_async(iterable=iterable)
            return
        self.process_whole(iterable=iterable)

    def _run(self, fn: Callable, contents: list[UploadStepContent]):
        """Load each entry's FileData and pass the batch to ``fn``."""
        upload_contents = []
        for entry in contents:
            file_data = FileData.from_file(entry["file_data_path"])
            upload_contents.append(UploadContent(path=Path(entry["path"]), file_data=file_data))
        fn(contents=upload_contents)

    async def _run_async(self, path: str, file_data_path: str, fn: Optional[Callable] = None):
        """Upload one file, using ``self.process.run_async`` when ``fn`` is not given."""
        upload_fn = fn or self.process.run_async
        upload_kwargs = {
            "path": Path(path),
            "file_data": FileData.from_file(path=file_data_path),
        }
        if not asyncio.iscoroutinefunction(upload_fn):
            upload_fn(**upload_kwargs)
            return
        semaphore = self.context.semaphore
        if not semaphore:
            await upload_fn(**upload_kwargs)
            return
        # Bound the number of concurrent coroutine calls.
        async with semaphore:
            await upload_fn(**upload_kwargs)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def sterilize_dict(data: dict) -> dict:
    """Return a JSON-safe copy of ``data`` by round-tripping it through JSON.

    ``Path`` values are rendered as POSIX strings and ``datetime`` values as
    ISO-format strings. Any other value the ``json`` module cannot encode
    raises ``TypeError``.
    """

    def _encode_fallback(value):
        # json.dumps calls this only for objects it cannot serialize natively.
        if isinstance(value, Path):
            return value.as_posix()
        if isinstance(value, datetime):
            return value.isoformat()
        raise TypeError("Type %s not serializable" % type(value))

    return json.loads(json.dumps(data, default=_encode_fallback))
|
|
File without changes
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from dataclasses import dataclass, fields
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
from unstructured.chunking import dispatch
|
|
7
|
+
from unstructured.documents.elements import Element, assign_and_map_hash_ids
|
|
8
|
+
from unstructured.staging.base import dict_to_elements, elements_from_json
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
|
|
11
|
+
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
12
|
+
from unstructured_ingest.v2.logger import logger
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class ChunkerConfig(EnhancedDataClassJsonMixin):
    """Options controlling whether and how elements are chunked."""

    # Name of the chunking strategy; ``Chunker.run`` only chunks locally for
    # "basic" and "by_title".
    chunking_strategy: Optional[str] = None
    # Server URL handed to the API client in ``Chunker.run_async``.
    chunking_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general"
    # When true, ``Chunker.is_async`` reports the chunker as async.
    chunk_by_api: bool = False
    # API key for the client; flagged as sensitive.
    chunk_api_key: Optional[str] = enhanced_field(default=None, sensitive=True)

    # Tuning options; each is forwarded by ``to_chunking_kwargs``.
    chunk_combine_text_under_n_chars: Optional[int] = None
    chunk_include_orig_elements: Optional[bool] = None
    chunk_max_characters: Optional[int] = None
    chunk_multipage_sections: Optional[bool] = None
    chunk_new_after_n_chars: Optional[int] = None
    chunk_overlap: Optional[int] = None
    chunk_overlap_all: Optional[bool] = None

    def to_chunking_kwargs(self) -> dict[str, Any]:
        """Map the config fields onto the keyword names used for chunking.

        The ``chunk_`` prefix is dropped from the tuning options, and
        ``chunk_combine_text_under_n_chars`` is exposed as ``combine_under_n_chars``.
        Unset options are included with the value ``None``.
        """
        return dict(
            chunking_strategy=self.chunking_strategy,
            combine_under_n_chars=self.chunk_combine_text_under_n_chars,
            max_characters=self.chunk_max_characters,
            include_orig_elements=self.chunk_include_orig_elements,
            multipage_sections=self.chunk_multipage_sections,
            new_after_n_chars=self.chunk_new_after_n_chars,
            overlap=self.chunk_overlap,
            overlap_all=self.chunk_overlap_all,
        )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class Chunker(BaseProcess, ABC):
    """Process that chunks previously partitioned elements, locally or via the API."""

    config: ChunkerConfig

    def is_async(self) -> bool:
        """Report the chunker as async exactly when chunking by API is enabled."""
        return self.config.chunk_by_api

    def run(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
        """Chunk the elements stored in ``elements_filepath`` locally.

        The elements are returned unchanged when the file holds none, or (with a
        warning) when the configured strategy is not a local one.
        """
        elements = elements_from_json(filename=str(elements_filepath))
        if not elements:
            return elements

        local_chunking_strategies = ("basic", "by_title")
        strategy = self.config.chunking_strategy
        if strategy in local_chunking_strategies:
            chunked_elements = dispatch.chunk(
                elements=elements, **self.config.to_chunking_kwargs()
            )
            assign_and_map_hash_ids(chunked_elements)
            return chunked_elements

        logger.warning(
            "chunking strategy not supported for local chunking: {}, must be one of: {}".format(
                strategy, ", ".join(local_chunking_strategies)
            )
        )
        return elements

    async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
        """Send the elements file to the configured endpoint and return the elements it yields.

        Chunking kwargs that ``PartitionParameters`` does not declare are dropped
        (and reported at debug level) before the request is built.
        """
        # Imported lazily so the client package is only needed for API chunking.
        from unstructured_client import UnstructuredClient
        from unstructured_client.models.shared import Files, PartitionParameters

        client = UnstructuredClient(
            api_key_auth=self.config.chunk_api_key,
            server_url=self.config.chunking_endpoint,
        )

        partition_request = self.config.to_chunking_kwargs()
        possible_fields = {field.name for field in fields(PartitionParameters)}
        filtered_partition_request = {
            key: value for key, value in partition_request.items() if key in possible_fields
        }
        omitted_fields = [
            key for key in partition_request if key not in filtered_partition_request
        ]
        if omitted_fields:
            logger.debug(
                "Following fields were omitted due to not being "
                "supported by the currently used unstructured client: {}".format(
                    ", ".join(omitted_fields)
                )
            )

        with open(elements_filepath, "rb") as stream:
            content = stream.read()
            files = Files(content=content, file_name=str(elements_filepath.resolve()))
        filtered_partition_request["files"] = files

        partition_params = PartitionParameters(**filtered_partition_request)
        resp = client.general.partition(partition_params)
        # A response without elements is treated as an empty list.
        elements_raw = resp.elements or []
        elements = dict_to_elements(elements_raw)
        assign_and_map_hash_ids(elements)
        return elements
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Optional, Type, TypeVar
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.v2.interfaces import (
|
|
5
|
+
ConnectionConfig,
|
|
6
|
+
Downloader,
|
|
7
|
+
DownloaderConfig,
|
|
8
|
+
Indexer,
|
|
9
|
+
IndexerConfig,
|
|
10
|
+
Uploader,
|
|
11
|
+
UploaderConfig,
|
|
12
|
+
UploadStager,
|
|
13
|
+
UploadStagerConfig,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
# Type variables for the source-side connector classes.
IndexerT = TypeVar("IndexerT", bound=Indexer)
IndexerConfigT = TypeVar("IndexerConfigT", bound=IndexerConfig)
DownloaderT = TypeVar("DownloaderT", bound=Downloader)
DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig)

# Connection configs are referenced by both source and destination entries.
ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)

# Type variables for the destination-side connector classes.
UploadStagerT = TypeVar("UploadStagerT", bound=UploadStager)
UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
UploaderT = TypeVar("UploaderT", bound=Uploader)
UploaderConfigT = TypeVar("UploaderConfigT", bound=UploaderConfig)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class SourceRegistryEntry:
    """Classes that together describe one source connector.

    The entry stores types, not instances: the indexer and downloader classes
    are required, the config classes are optional.
    """

    # Required connector classes.
    indexer: Type[IndexerT]
    downloader: Type[DownloaderT]

    # Optional config classes; None when the connector does not declare one.
    downloader_config: Optional[Type[DownloaderConfigT]] = None
    indexer_config: Optional[Type[IndexerConfigT]] = None
    connection_config: Optional[Type[ConnectionConfigT]] = None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Registered source connectors, keyed by source type.
source_registry: dict[str, SourceRegistryEntry] = {}


def add_source_entry(source_type: str, entry: SourceRegistryEntry):
    """Register ``entry`` under ``source_type``.

    Raises:
        ValueError: if ``source_type`` is already present in ``source_registry``.
    """
    already_registered = source_type in source_registry
    if not already_registered:
        source_registry[source_type] = entry
        return
    raise ValueError(f"source {source_type} has already been registered")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class DestinationRegistryEntry:
    """Classes that together describe one destination connector.

    The entry stores types, not instances: only the uploader class is required.
    """

    # Required uploader class and optional stager class.
    uploader: Type[UploaderT]
    upload_stager: Optional[Type[UploadStagerT]] = None

    # Optional config classes; None when the connector does not declare one.
    upload_stager_config: Optional[Type[UploadStagerConfigT]] = None
    uploader_config: Optional[Type[UploaderConfigT]] = None

    connection_config: Optional[Type[ConnectionConfigT]] = None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Registered destination connectors, keyed by destination type.
destination_registry: dict[str, DestinationRegistryEntry] = {}


def add_destination_entry(destination_type: str, entry: DestinationRegistryEntry):
    """Register ``entry`` under ``destination_type``.

    Raises:
        ValueError: if ``destination_type`` is already present in ``destination_registry``.
    """
    already_registered = destination_type in destination_registry
    if not already_registered:
        destination_registry[destination_type] = entry
        return
    raise ValueError(f"destination {destination_type} has already been registered")
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from __future__ import annotations

# Imported only for its side effects. It has to be this package's own fsspec subpackage:
# the previous `unstructured.ingest...` path pointed at the separate `unstructured`
# distribution, so its connectors would not be loaded from `unstructured_ingest`.
# NOTE(review): presumably the subpackage registers the fsspec connectors on import - confirm.
import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401

from unstructured_ingest.v2.processes.connector_registry import (
    add_destination_entry,
    add_source_entry,
)

from .astra import CONNECTOR_TYPE as ASTRA_CONNECTOR_TYPE
from .astra import astra_destination_entry
from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
from .chroma import chroma_destination_entry
from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
from .databricks_volumes import databricks_volumes_destination_entry
from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
from .google_drive import google_drive_source_entry
from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
from .local import local_destination_entry, local_source_entry
from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
from .mongodb import mongodb_destination_entry
from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
from .onedrive import onedrive_source_entry
from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
from .opensearch import opensearch_destination_entry, opensearch_source_entry
from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
from .pinecone import pinecone_destination_entry
from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
from .salesforce import salesforce_source_entry
from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
from .sharepoint import sharepoint_source_entry
from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
from .singlestore import singlestore_destination_entry
from .sql import CONNECTOR_TYPE as SQL_CONNECTOR_TYPE
from .sql import sql_destination_entry
from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
from .weaviate import weaviate_destination_entry

# Register every connector at import time. Each add_*_entry call raises ValueError when
# its connector type is already registered, so every type may appear at most once per
# registry. The call order is kept as-is because it determines registry insertion order.
add_destination_entry(destination_type=ASTRA_CONNECTOR_TYPE, entry=astra_destination_entry)

add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)

add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
add_destination_entry(
    destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
)

add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry)

add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry)
add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry)

add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)

add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
add_destination_entry(
    destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
)

add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)

add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)

add_destination_entry(
    destination_type=DATABRICKS_VOLUMES_CONNECTOR_TYPE, entry=databricks_volumes_destination_entry
)

add_destination_entry(destination_type=SQL_CONNECTOR_TYPE, entry=sql_destination_entry)

add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry)
add_destination_entry(destination_type=PINECONE_CONNECTOR_TYPE, entry=pinecone_destination_entry)
add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)
add_destination_entry(
    destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
)
|