unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import os
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional, Union
|
|
6
|
+
|
|
7
|
+
from dateutil import parser
|
|
8
|
+
from unstructured.documents.elements import DataSourceMetadata
|
|
9
|
+
from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
12
|
+
from unstructured_ingest.error import SourceConnectionNetworkError
|
|
13
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
+
from unstructured_ingest.utils.string_and_date_utils import json_to_dict
|
|
15
|
+
from unstructured_ingest.v2.interfaces import (
|
|
16
|
+
AccessConfig,
|
|
17
|
+
ConnectionConfig,
|
|
18
|
+
Downloader,
|
|
19
|
+
DownloaderConfig,
|
|
20
|
+
FileData,
|
|
21
|
+
Indexer,
|
|
22
|
+
IndexerConfig,
|
|
23
|
+
SourceIdentifiers,
|
|
24
|
+
download_responses,
|
|
25
|
+
)
|
|
26
|
+
from unstructured_ingest.v2.logger import logger
|
|
27
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
28
|
+
SourceRegistryEntry,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
CONNECTOR_TYPE = "google_drive"
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from googleapiclient.discovery import Resource as GoogleAPIResource
|
|
35
|
+
from googleapiclient.http import MediaIoBaseDownload
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class GoogleDriveAccessConfig(AccessConfig):
    """Credentials used to reach the Google Drive API."""

    # Service-account key: either the key as a dict, or a string holding a
    # file path (a dict may also arrive serialised as a string).
    service_account_key: Union[str, dict]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class GoogleDriveConnectionConfig(ConnectionConfig):
    """Connection settings for the Google Drive source connector.

    ``drive_id`` identifies the Drive object to ingest and ``access_config``
    carries the service-account credentials.
    """

    drive_id: str
    access_config: GoogleDriveAccessConfig = enhanced_field(sensitive=True)

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def get_files_service(self) -> "GoogleAPIResource":
        """Build a Drive v3 client and return its ``files()`` resource.

        Returns:
            The ``files()`` resource of an authenticated Drive v3 service.

        Raises:
            TypeError: ``service_account_key`` is neither ``str`` nor ``dict``.
            ValueError: the key could not be interpreted, the default
                credentials lookup failed, or the API raised an ``HttpError``.
                The original exception is attached as ``__cause__``.
        """
        from google.auth import default, exceptions
        from google.oauth2 import service_account
        from googleapiclient.discovery import build
        from googleapiclient.errors import HttpError

        # Service account key can be a dict or a file path (str),
        # but the dict may come in as a string.
        raw_key = self.access_config.service_account_key
        if isinstance(raw_key, str):
            # NOTE(review): presumably json_to_dict returns a dict for JSON
            # text and hands back the string otherwise; both outcomes are
            # handled below.
            key_path = json_to_dict(raw_key)
        elif isinstance(raw_key, dict):
            key_path = raw_key
        else:
            raise TypeError(
                "access_config.service_account_key must be "
                f"str or dict, got: {type(raw_key)}"
            )

        try:
            if isinstance(key_path, dict):
                creds = service_account.Credentials.from_service_account_info(key_path)
            elif isinstance(key_path, str):
                # NOTE: this mutates the process-wide environment so that
                # google.auth.default() picks up the key file.
                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
                creds, _ = default()
            else:
                raise ValueError(
                    "key path not recognized as a dictionary or a file path: "
                    f"[{type(key_path)}] {key_path}",
                )
            service = build("drive", "v3", credentials=creds)
            return service.files()

        except HttpError as exc:
            # Chain explicitly so the underlying API error stays attached.
            raise ValueError(f"{exc.reason}") from exc
        except exceptions.DefaultCredentialsError as exc:
            raise ValueError("The provided API key is invalid.") from exc
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
class GoogleDriveIndexerConfig(IndexerConfig):
    """Options that control which Drive files are indexed."""

    # Extensions to keep; stored without a leading period after init.
    extensions: Optional[list[str]] = None
    # Passed through to the indexer to decide whether sub-folders are walked.
    recursive: bool = False

    def __post_init__(self):
        """Strip one leading period from every configured extension."""
        if self.extensions is None:
            return
        self.extensions = [ext.removeprefix(".") for ext in self.extensions]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@dataclass
class GoogleDriveIndexer(Indexer):
    """Enumerate files below a Google Drive object and emit ``FileData`` records."""

    connection_config: GoogleDriveConnectionConfig
    index_config: GoogleDriveIndexerConfig
    # File attributes requested from the Drive API for every record.
    fields: list[str] = field(
        default_factory=lambda: [
            "id",
            "name",
            "mimeType",
            "fileExtension",
            "md5Checksum",
            "sha1Checksum",
            "sha256Checksum",
            "headRevisionId",
            "permissions",
            "createdTime",
            "modifiedTime",
            "version",
            "originalFilename",
            "capabilities",
            "permissionIds",
            "webViewLink",
            "webContentLink",
        ]
    )

    @staticmethod
    def is_dir(record: dict) -> bool:
        """Return True when ``record`` carries the Drive folder MIME type."""
        return record.get("mimeType") == "application/vnd.google-apps.folder"

    @staticmethod
    def map_file_data(f: dict) -> FileData:
        """Turn one Drive file record into a ``FileData``.

        The recognised keys are popped from ``f`` (the dict is mutated); the
        remaining entries are attached as ``additional_metadata``.

        Args:
            f: a file record; must contain ``id`` and ``name``.

        Returns:
            ``FileData`` whose dates are epoch-second strings, or ``None``
            when the record has no ``createdTime`` / ``modifiedTime``.
        """
        file_id = f["id"]
        filename = f.pop("name")
        url = f.pop("webContentLink", None)
        version = f.pop("version", None)
        permissions = f.pop("permissions", None)
        date_created_str = f.pop("createdTime", None)
        date_modified_str = f.pop("modifiedTime", None)
        parent_path = f.pop("parent_path", None)
        parent_root_path = f.pop("parent_root_path", None)

        # Either timestamp may be absent; keep None instead of calling
        # .timestamp() on a missing value.
        date_created = (
            str(parser.parse(date_created_str).timestamp()) if date_created_str else None
        )
        date_modified = (
            str(parser.parse(date_modified_str).timestamp()) if date_modified_str else None
        )

        if (
            parent_path
            and isinstance(parent_path, str)
            and parent_root_path
            and isinstance(parent_root_path, str)
        ):
            fullpath = f"{parent_path}/{filename}"
            # Drop only the leading root component; str.replace would also
            # remove later occurrences of the root name inside the path.
            rel_path = fullpath.removeprefix(parent_root_path)
            source_identifiers = SourceIdentifiers(
                filename=filename, fullpath=fullpath, rel_path=rel_path
            )
        else:
            source_identifiers = SourceIdentifiers(fullpath=filename, filename=filename)

        return FileData(
            connector_type=CONNECTOR_TYPE,
            identifier=file_id,
            source_identifiers=source_identifiers,
            metadata=DataSourceMetadata(
                url=url,
                version=version,
                date_created=date_created,
                date_modified=date_modified,
                permissions_data=permissions,
                record_locator={
                    "file_id": file_id,
                },
            ),
            additional_metadata=f,
        )

    def get_paginated_results(
        self,
        files_client,
        object_id: str,
        extensions: Optional[list[str]] = None,
        recursive: bool = False,
        previous_path: Optional[str] = None,
    ) -> list[dict]:
        """Collect the non-folder records whose parent is ``object_id``.

        Follows ``nextPageToken`` until the listing is exhausted. With
        ``recursive`` set, every sub-folder is listed as well, with its name
        appended to ``previous_path``.

        Args:
            files_client: Drive ``files()`` resource used for the list calls.
            object_id: ID of the folder to list.
            extensions: when given, only files with these extensions (and
                folders) are requested.
            recursive: whether to descend into sub-folders.
            previous_path: path of the folder being listed; stored on each
                direct child as ``parent_path``.

        Returns:
            File records, each tagged with ``parent_path`` and
            ``parent_root_path``.
        """
        fields_input = "nextPageToken, files({})".format(",".join(self.fields))
        q = f"'{object_id}' in parents"
        # Filter by extension but still include any directories
        if extensions:
            ext_filter = " or ".join([f"fileExtension = '{e}'" for e in extensions])
            q = f"{q} and ({ext_filter} or mimeType = 'application/vnd.google-apps.folder')"
        logger.debug(f"Query used when indexing: {q}")
        logger.debug("response fields limited to: {}".format(", ".join(self.fields)))

        page_token = None
        files_response = []
        while True:
            response: dict = files_client.list(
                spaces="drive",
                fields=fields_input,
                corpora="user",
                pageToken=page_token,
                q=q,
            ).execute()
            if files := response.get("files", []):
                fs = [f for f in files if not self.is_dir(record=f)]
                for r in fs:
                    r["parent_path"] = previous_path
                dirs = [f for f in files if self.is_dir(record=f)]
                files_response.extend(fs)
                if recursive:
                    for d in dirs:
                        files_response.extend(
                            self.get_paginated_results(
                                files_client=files_client,
                                object_id=d["id"],
                                extensions=extensions,
                                recursive=recursive,
                                previous_path=f"{previous_path}/{d['name']}",
                            )
                        )
            page_token = response.get("nextPageToken")
            if page_token is None:
                break

        # Runs at every recursion level, so the outermost call's path is the
        # value that finally sticks on every record.
        for r in files_response:
            r["parent_root_path"] = previous_path
        return files_response

    def get_root_info(self, files_client, object_id: str) -> dict:
        """Fetch the record for ``object_id`` limited to ``self.fields``."""
        return files_client.get(fileId=object_id, fields=",".join(self.fields)).execute()

    def get_files(
        self,
        files_client,
        object_id: str,
        recursive: bool = False,
        extensions: Optional[list[str]] = None,
    ) -> list[FileData]:
        """Return ``FileData`` for ``object_id`` or, if it is a folder, its contents.

        Every returned item gets ``drive_id`` added to its record locator.
        """
        root_info = self.get_root_info(files_client=files_client, object_id=object_id)
        if not self.is_dir(root_info):
            data = [self.map_file_data(root_info)]
        else:
            file_contents = self.get_paginated_results(
                files_client=files_client,
                object_id=object_id,
                extensions=extensions,
                recursive=recursive,
                previous_path=root_info["name"],
            )
            data = [self.map_file_data(f=f) for f in file_contents]
        for d in data:
            # Must be an assignment: `target: value` is only an annotation
            # and stores nothing.
            d.metadata.record_locator["drive_id"] = object_id
        return data

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield ``FileData`` for the configured drive object."""
        yield from self.get_files(
            files_client=self.connection_config.get_files_service(),
            object_id=self.connection_config.drive_id,
            recursive=self.index_config.recursive,
            extensions=self.index_config.extensions,
        )
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
@dataclass
class GoogleDriveDownloaderConfig(DownloaderConfig):
    """Downloader settings for Google Drive; declares no fields of its own."""
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
@dataclass
|
|
268
|
+
class GoogleDriveDownloader(Downloader):
|
|
269
|
+
connection_config: GoogleDriveConnectionConfig
|
|
270
|
+
download_config: GoogleDriveDownloaderConfig = field(
|
|
271
|
+
default_factory=lambda: GoogleDriveDownloaderConfig()
|
|
272
|
+
)
|
|
273
|
+
connector_type: str = CONNECTOR_TYPE
|
|
274
|
+
|
|
275
|
+
def get_download_path(self, file_data: FileData) -> Path:
    """Return where ``file_data`` is written below ``self.download_dir``."""
    relative = file_data.source_identifiers.relative_path
    # An absolute right-hand operand would replace download_dir in the join,
    # so one leading slash is dropped first.
    if relative.startswith("/"):
        relative = relative[1:]
    return self.download_dir / Path(relative)
|
|
279
|
+
|
|
280
|
+
@SourceConnectionNetworkError.wrap
|
|
281
|
+
def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
|
|
282
|
+
downloaded = False
|
|
283
|
+
while downloaded is False:
|
|
284
|
+
_, downloaded = downloader.next_chunk()
|
|
285
|
+
return downloaded
|
|
286
|
+
|
|
287
|
+
def _write_file(self, file_data: FileData, file_contents: io.BytesIO):
|
|
288
|
+
download_path = self.get_download_path(file_data=file_data)
|
|
289
|
+
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
290
|
+
logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")
|
|
291
|
+
with open(download_path, "wb") as handler:
|
|
292
|
+
handler.write(file_contents.getbuffer())
|
|
293
|
+
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
294
|
+
|
|
295
|
+
@requires_dependencies(["googleapiclient"], extras="google-drive")
|
|
296
|
+
def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
|
|
297
|
+
from googleapiclient.http import MediaIoBaseDownload
|
|
298
|
+
|
|
299
|
+
logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
|
|
300
|
+
mime_type = file_data.additional_metadata["mimeType"]
|
|
301
|
+
record_id = file_data.identifier
|
|
302
|
+
files_client = self.connection_config.get_files_service()
|
|
303
|
+
if mime_type.startswith("application/vnd.google-apps"):
|
|
304
|
+
export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
|
|
305
|
+
self.meta.get("mimeType"), # type: ignore
|
|
306
|
+
)
|
|
307
|
+
if not export_mime:
|
|
308
|
+
raise TypeError(
|
|
309
|
+
f"File not supported. Name: {file_data.source_identifiers.filename} "
|
|
310
|
+
f"ID: {record_id} "
|
|
311
|
+
f"MimeType: {mime_type}"
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
request = files_client.export_media(
|
|
315
|
+
fileId=record_id,
|
|
316
|
+
mimeType=export_mime,
|
|
317
|
+
)
|
|
318
|
+
else:
|
|
319
|
+
request = files_client.get_media(fileId=record_id)
|
|
320
|
+
|
|
321
|
+
file_contents = io.BytesIO()
|
|
322
|
+
downloader = MediaIoBaseDownload(file_contents, request)
|
|
323
|
+
downloaded = self._get_content(downloader=downloader)
|
|
324
|
+
if not downloaded or not file_contents:
|
|
325
|
+
return []
|
|
326
|
+
return self._write_file(file_data=file_data, file_contents=file_contents)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
# Registry entry bundling the Google Drive source connector's classes.
google_drive_source_entry = SourceRegistryEntry(
    indexer=GoogleDriveIndexer,
    indexer_config=GoogleDriveIndexerConfig,
    downloader=GoogleDriveDownloader,
    downloader_config=GoogleDriveDownloaderConfig,
    connection_config=GoogleDriveConnectionConfig,
)
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import itertools
|
|
3
|
+
import shutil
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from time import time
|
|
7
|
+
from typing import Any, Generator, Optional
|
|
8
|
+
|
|
9
|
+
from unstructured.documents.elements import DataSourceMetadata
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.v2.interfaces import (
|
|
12
|
+
AccessConfig,
|
|
13
|
+
ConnectionConfig,
|
|
14
|
+
Downloader,
|
|
15
|
+
DownloaderConfig,
|
|
16
|
+
DownloadResponse,
|
|
17
|
+
FileData,
|
|
18
|
+
Indexer,
|
|
19
|
+
IndexerConfig,
|
|
20
|
+
SourceIdentifiers,
|
|
21
|
+
UploadContent,
|
|
22
|
+
Uploader,
|
|
23
|
+
UploaderConfig,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.v2.logger import logger
|
|
26
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
27
|
+
DestinationRegistryEntry,
|
|
28
|
+
SourceRegistryEntry,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
# Connector type identifier used by the local indexer, downloader and uploader below.
CONNECTOR_TYPE = "local"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class LocalAccessConfig(AccessConfig):
    """Access settings for the local connector; declares no fields of its own."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class LocalConnectionConfig(ConnectionConfig):
    """Connection settings for the local connector; holds only an access config."""

    # A fresh LocalAccessConfig is built for every instance.
    access_config: LocalAccessConfig = field(default_factory=LocalAccessConfig)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class LocalIndexerConfig(IndexerConfig):
    """Settings that select which local files get indexed."""

    # File or directory to index, exactly as supplied by the caller.
    input_path: str
    # Whether sub-directories are searched as well.
    recursive: bool = False
    # Optional glob patterns; when unset or empty, everything is matched.
    file_glob: Optional[list[str]] = None

    @property
    def path(self) -> Path:
        """Absolute, resolved form of ``input_path``."""
        raw = Path(self.input_path)
        return raw.resolve()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class LocalIndexer(Indexer):
    """Walks a local file or directory and yields one ``FileData`` per file."""

    index_config: LocalIndexerConfig
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    def list_files(self) -> list[Path]:
        """Return the regular files selected by the index configuration.

        A single-file input is returned as-is. For a directory, the configured
        glob patterns (or ``*`` when none are set) are applied, recursively if
        requested. Directories are dropped and a file matched by several
        patterns is listed once, in first-seen order.
        """
        input_path = self.index_config.path
        if input_path.is_file():
            # Return the path directly: passing it through glob would
            # misread file names containing glob metacharacters such as "[".
            return [input_path]
        glob_fn = input_path.rglob if self.index_config.recursive else input_path.glob
        patterns = self.index_config.file_glob or ["*"]
        matches = itertools.chain.from_iterable(glob_fn(pattern) for pattern in patterns)
        # dict.fromkeys de-duplicates while preserving order; is_file() filters
        # out the directories that "*" also matches.
        return [match for match in dict.fromkeys(matches) if match.is_file()]

    def get_file_metadata(self, path: Path) -> DataSourceMetadata:
        """Collect filesystem metadata for ``path``.

        Each attribute is read best-effort: when it cannot be obtained (for
        example ``st_birthtime`` is not exposed on every platform) a warning is
        logged and the corresponding field is left as ``None``.

        Raises:
            OSError: ``path`` cannot be stat'ed at all.
        """
        stats = path.stat()
        try:
            date_modified = str(stats.st_mtime)
        except Exception as e:
            logger.warning(f"Couldn't detect date modified: {e}")
            date_modified = None

        try:
            date_created = str(stats.st_birthtime)
        except Exception as e:
            logger.warning(f"Couldn't detect date created: {e}")
            date_created = None

        try:
            mode = stats.st_mode
            permissions_data = [{"mode": mode}]
        except Exception as e:
            logger.warning(f"Couldn't detect file mode: {e}")
            permissions_data = None
        return DataSourceMetadata(
            date_modified=date_modified,
            date_created=date_created,
            date_processed=str(time()),
            permissions_data=permissions_data,
            record_locator={"path": str(path.resolve())},
        )

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every file returned by ``list_files``.

        The relative path is the file's location below the input directory, or
        just the file name when the input itself is a single file. ``kwargs``
        is accepted but not used.
        """
        # The root is the same for every file, so resolve and inspect it once.
        root = self.index_config.path
        root_str = str(root.resolve())
        root_is_file = root.is_file()
        for file_path in self.list_files():
            resolved = str(file_path.resolve())
            if root_is_file:
                rel_path = root.name
            else:
                # Remove only the first occurrence of the root, then the path
                # separator that follows it.
                rel_path = resolved.replace(root_str, "", 1)[1:]
            yield FileData(
                identifier=resolved,
                connector_type=CONNECTOR_TYPE,
                source_identifiers=SourceIdentifiers(
                    fullpath=resolved,
                    filename=file_path.name,
                    rel_path=rel_path,
                ),
                metadata=self.get_file_metadata(path=file_path),
            )
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@dataclass
class LocalDownloaderConfig(DownloaderConfig):
    """Download settings for the local connector; declares no fields of its own."""
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
@dataclass
class LocalDownloader(Downloader):
    """Downloader for local files: it copies nothing and points at the original path."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: LocalConnectionConfig = field(default_factory=LocalConnectionConfig)
    download_config: LocalDownloaderConfig = field(default_factory=LocalDownloaderConfig)

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the file's ``fullpath`` as a ``Path``."""
        identifiers = file_data.source_identifiers
        return Path(identifiers.fullpath)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Build a ``DownloadResponse`` whose path is the file's ``fullpath``."""
        local_path = Path(file_data.source_identifiers.fullpath)
        return DownloadResponse(file_data=file_data, path=local_path)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
@dataclass
class LocalUploaderConfig(UploaderConfig):
    """Settings for writing results to a local directory."""

    # Directory that receives the output files.
    output_dir: str = "structured-output"

    @property
    def output_path(self) -> Path:
        """Absolute, resolved form of ``output_dir``."""
        raw = Path(self.output_dir)
        return raw.resolve()

    def __post_init__(self):
        """Reject an ``output_dir`` that already exists as a regular file.

        Raises:
            ValueError: the output path exists and is a file.
        """
        target = self.output_path
        if target.exists() and target.is_file():
            raise ValueError("output path already exists as a file")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@dataclass
class LocalUploader(Uploader):
    """Copies each processed file into the configured local output directory."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: LocalUploaderConfig = field(default_factory=lambda: LocalUploaderConfig())
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )

    def is_async(self) -> bool:
        """Report that this uploader runs synchronously."""
        return False

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Copy every item in ``contents`` to ``<output_path>/<relative path>.json``.

        When an item has source identifiers, its relative path below the output
        directory is kept and ``.json`` is appended to the final component.
        Otherwise the file is named ``<identifier>.json`` directly in the
        output directory. ``kwargs`` is accepted but not used.
        """
        output_root = self.upload_config.output_path
        output_root.mkdir(parents=True, exist_ok=True)
        for content in contents:
            if identifiers := content.file_data.source_identifiers:
                rel_path = (
                    identifiers.relative_path[1:]
                    if identifiers.relative_path.startswith("/")
                    else identifiers.relative_path
                )
                new_path = output_root / Path(rel_path)
                # Append the suffix to the last component only. A string
                # replace over the whole path would also rewrite any directory
                # whose name contains the file name.
                final_path = new_path.parent / f"{new_path.name}.json"
            else:
                final_path = output_root / Path(f"{content.file_data.identifier}.json")
            final_path.parent.mkdir(parents=True, exist_ok=True)
            logger.debug(f"copying file from {content.path} to {final_path}")
            shutil.copy(src=str(content.path), dst=str(final_path))
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# Registry entry bundling the local source connector's classes.
local_source_entry = SourceRegistryEntry(
    connection_config=LocalConnectionConfig,
    indexer_config=LocalIndexerConfig,
    indexer=LocalIndexer,
    downloader_config=LocalDownloaderConfig,
    downloader=LocalDownloader,
)
|
|
201
|
+
|
|
202
|
+
# Registry entry bundling the local destination connector's classes.
local_destination_entry = DestinationRegistryEntry(
    uploader_config=LocalUploaderConfig,
    uploader=LocalUploader,
)
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
5
|
+
|
|
6
|
+
from unstructured.__version__ import __version__ as unstructured_version
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
9
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
10
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
|
+
from unstructured_ingest.v2.interfaces import (
|
|
12
|
+
AccessConfig,
|
|
13
|
+
ConnectionConfig,
|
|
14
|
+
FileData,
|
|
15
|
+
UploadContent,
|
|
16
|
+
Uploader,
|
|
17
|
+
UploaderConfig,
|
|
18
|
+
UploadStager,
|
|
19
|
+
UploadStagerConfig,
|
|
20
|
+
)
|
|
21
|
+
from unstructured_ingest.v2.logger import logger
|
|
22
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
23
|
+
DestinationRegistryEntry,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from pymongo import MongoClient
|
|
28
|
+
|
|
29
|
+
# Connector type identifier used by the MongoDB connection config and uploader below.
CONNECTOR_TYPE = "mongodb"
|
|
30
|
+
# Version string handed to pymongo's ServerApi when the client is created.
SERVER_API_VERSION = "1"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class MongoDBAccessConfig(AccessConfig):
    """Access settings for MongoDB."""

    # Connection URI; when set, the uploader connects with it instead of host/port.
    uri: Optional[str] = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class MongoDBConnectionConfig(ConnectionConfig):
    """Where to connect and which database/collection to write to."""

    # Marked sensitive through enhanced_field.
    # NOTE(review): presumably so the URI is redacted when serialized; confirm.
    access_config: MongoDBAccessConfig = enhanced_field(
        default_factory=MongoDBAccessConfig,
        sensitive=True,
    )
    # Host and port are used only when no URI is configured.
    host: Optional[str] = None
    # Target database and collection for uploaded documents.
    database: Optional[str] = None
    collection: Optional[str] = None
    port: int = 27017
    # NOTE(review): the uploader in this module batches with
    # MongoDBUploaderConfig.batch_size, not this field; confirm it is still needed.
    batch_size: int = 100
    connector_type: str = CONNECTOR_TYPE
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class MongoDBUploadStagerConfig(UploadStagerConfig):
    """Staging settings for MongoDB; declares no fields of its own."""
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class MongoDBUploadStager(UploadStager):
    """Re-serializes an elements JSON file into the staging directory."""

    upload_stager_config: MongoDBUploadStagerConfig = field(
        default_factory=MongoDBUploadStagerConfig
    )

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Load the elements JSON and write it to ``<output_dir>/<output_filename>.json``.

        ``file_data`` and ``kwargs`` are accepted but not used.

        Returns:
            Path of the file that was written.
        """
        with open(elements_filepath) as source:
            payload = json.load(source)

        staged_path = Path(output_dir, f"{output_filename}.json")
        with staged_path.open("w") as sink:
            json.dump(payload, sink)
        return staged_path
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class MongoDBUploaderConfig(UploaderConfig):
    """Upload settings for MongoDB."""

    # Passed to batch_generator by the uploader.
    # NOTE(review): presumably the number of documents per insert_many call; confirm.
    batch_size: int = 100
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass
class MongoDBUploader(Uploader):
    """Inserts staged element documents into a MongoDB collection."""

    upload_config: MongoDBUploaderConfig
    connection_config: MongoDBConnectionConfig
    # Not a constructor argument; created in __post_init__.
    client: Optional["MongoClient"] = field(init=False)
    connector_type: str = CONNECTOR_TYPE

    def __post_init__(self):
        """Create the MongoDB client as soon as the uploader is constructed."""
        self.client = self.create_client()

    @requires_dependencies(["pymongo"], extras="mongodb")
    def create_client(self) -> "MongoClient":
        """Build a ``MongoClient`` from the URI if one is set, else from host/port.

        Both connection styles use the same server API version and driver info.
        """
        from pymongo import MongoClient
        from pymongo.driver_info import DriverInfo
        from pymongo.server_api import ServerApi

        # Shared by both branches so URI and host/port connections are configured alike.
        common_kwargs = {
            "server_api": ServerApi(version=SERVER_API_VERSION),
            "driver": DriverInfo(name="unstructured", version=unstructured_version),
        }
        if uri := self.connection_config.access_config.uri:
            return MongoClient(uri, **common_kwargs)
        return MongoClient(
            host=self.connection_config.host,
            port=self.connection_config.port,
            **common_kwargs,
        )

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Load every staged JSON file and insert its elements in batches.

        Elements from all files in ``contents`` are pooled into one list before
        being written to the configured database and collection. ``kwargs`` is
        accepted but not used.
        """
        documents = []
        for content in contents:
            with open(content.path) as elements_file:
                documents.extend(json.load(elements_file))

        logger.info(
            f"writing {len(documents)} objects to destination "
            f"db, {self.connection_config.database}, "
            f"collection {self.connection_config.collection} "
            f"at {self.connection_config.host}",
        )
        db = self.client[self.connection_config.database]
        collection = db[self.connection_config.collection]
        for chunk in batch_generator(documents, self.upload_config.batch_size):
            collection.insert_many(chunk)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# Registry entry bundling the MongoDB destination connector's classes.
mongodb_destination_entry = DestinationRegistryEntry(
    connection_config=MongoDBConnectionConfig,
    upload_stager_config=MongoDBUploadStagerConfig,
    upload_stager=MongoDBUploadStager,
    uploader_config=MongoDBUploaderConfig,
    uploader=MongoDBUploader,
)
|