unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import typing as t
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest import __name__ as integration_name
|
|
7
|
+
from unstructured_ingest.__version__ import __version__ as integration_version
|
|
8
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
9
|
+
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
10
|
+
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
11
|
+
from unstructured_ingest.interfaces import (
|
|
12
|
+
AccessConfig,
|
|
13
|
+
BaseConnectorConfig,
|
|
14
|
+
BaseDestinationConnector,
|
|
15
|
+
BaseSingleIngestDoc,
|
|
16
|
+
BaseSourceConnector,
|
|
17
|
+
IngestDocCleanupMixin,
|
|
18
|
+
SourceConnectorCleanupMixin,
|
|
19
|
+
SourceMetadata,
|
|
20
|
+
WriteConfig,
|
|
21
|
+
)
|
|
22
|
+
from unstructured_ingest.logger import logger
|
|
23
|
+
from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
|
|
24
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
25
|
+
|
|
26
|
+
if t.TYPE_CHECKING:
|
|
27
|
+
from astrapy.db import AstraDB, AstraDBCollection
|
|
28
|
+
|
|
29
|
+
NON_INDEXED_FIELDS = ["metadata._node_content", "content"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class AstraAccessConfig(AccessConfig):
    """Credentials for an Astra DB instance.

    Both fields are declared via ``enhanced_field(sensitive=True)`` so they
    are treated as secrets when the config is serialized or logged.
    """

    # Astra DB application token used to authenticate API calls.
    token: str = enhanced_field(sensitive=True)
    # Base URL of the Astra DB JSON API endpoint for the target database.
    api_endpoint: str = enhanced_field(sensitive=True)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class SimpleAstraConfig(BaseConnectorConfig):
    """Connection settings shared by the Astra source and destination connectors."""

    # Token + endpoint credentials (see AstraAccessConfig).
    access_config: AstraAccessConfig
    # Name of the Astra DB collection to read from / write to.
    collection_name: str
    # Optional keyspace/namespace; None lets the client use the server default.
    namespace: t.Optional[str] = None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single Astra DB record staged for ingestion.

    The record's raw fields arrive in ``metadata``; ``get_file`` flattens them
    and writes the values to a local text file for downstream partitioning.
    """

    connector_config: SimpleAstraConfig
    metadata: t.Dict[str, str] = field(default_factory=dict)
    registry_name: str = "astra"

    def _record_path(self, base_dir, suffix: str) -> Path:
        """Resolve ``<base_dir>/<collection_name>/<record _id><suffix>``."""
        collection = self.connector_config.collection_name
        record_file = f"{self.metadata['_id']}{suffix}"
        return (Path(base_dir) / collection / record_file).resolve()

    @property
    def filename(self):
        # Local download target for the raw record text.
        return self._record_path(self.read_config.download_dir, ".txt")

    @property
    def _output_filename(self):
        # Local target for the structured (JSON) output.
        return self._record_path(self.processor_config.output_dir, ".json")

    def update_source_metadata(self, **kwargs):
        # A record without metadata is reported as nonexistent at the source.
        self.source_metadata = SourceMetadata(exists=bool(self.metadata))

    @SourceConnectionError.wrap
    @requires_dependencies(["astrapy"], extras="astra")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Flatten the record's fields and persist their values to disk."""
        target = self.filename
        target.parent.mkdir(parents=True, exist_ok=True)

        flattened = flatten_dict(dictionary=self.metadata)
        body = "\n".join(str(value) for value in flattened.values())

        with open(target, "w") as f:
            f.write(body)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@dataclass
class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that reads every document from an Astra DB collection."""

    connector_config: SimpleAstraConfig
    # Lazily created astrapy handles; excluded from __init__ on purpose.
    _astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
    _astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)

    @property
    @requires_dependencies(["astrapy"], extras="astra")
    def astra_db_collection(self) -> "AstraDBCollection":
        """Lazily build the AstraDB client and connect to the collection."""
        if self._astra_db_collection is None:
            from astrapy.db import AstraDB

            access = self.connector_config.access_config
            # caller_name/version let Astra attribute traffic to this integration.
            self._astra_db = AstraDB(
                api_endpoint=access.api_endpoint,
                token=access.token,
                namespace=self.connector_config.namespace,
                caller_name=integration_name,
                caller_version=integration_version,
            )
            self._astra_db_collection = self._astra_db.collection(
                collection_name=self.connector_config.collection_name,
            )
        return self._astra_db_collection  # type: ignore

    @requires_dependencies(["astrapy"], extras="astra")
    @SourceConnectionError.wrap  # type: ignore
    def initialize(self):
        # Touch the lazy property so connection problems surface early.
        _ = self.astra_db_collection

    @requires_dependencies(["astrapy"], extras="astra")
    def check_connection(self):
        """Validate connectivity; wrap any failure in SourceConnectionError."""
        try:
            _ = self.astra_db_collection
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}")

    def _make_doc(self, record) -> AstraIngestDoc:
        """Wrap one Astra record in an ingest doc with populated source metadata."""
        doc = AstraIngestDoc(
            connector_config=self.connector_config,
            processor_config=self.processor_config,
            read_config=self.read_config,
            metadata=record,
        )
        doc.update_source_metadata()
        return doc

    @requires_dependencies(["astrapy"], extras="astra")
    def get_ingest_docs(self):  # type: ignore
        """Fetch all records via paginated find and return them as ingest docs."""
        records = list(self.astra_db_collection.paginated_find())
        return [self._make_doc(record) for record in records]
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@dataclass
class AstraWriteConfig(WriteConfig):
    """Write-side settings for the Astra destination connector."""

    # Vector dimension used when creating the destination collection.
    embedding_dimension: int
    # Optional "indexing" policy forwarded to collection creation; None omits it.
    requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None
    # Number of documents sent per insert_many call.
    batch_size: int = 20
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@dataclass
class AstraDestinationConnector(BaseDestinationConnector):
    """Destination connector that upserts processed elements into Astra DB."""

    write_config: AstraWriteConfig
    connector_config: SimpleAstraConfig
    # Lazily created astrapy handles; excluded from __init__ and from
    # serialization (see to_dict) because they hold unpicklable locks.
    _astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
    _astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """
        The _astra_db_collection variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        self_cp = copy.copy(self)

        if hasattr(self_cp, "_astra_db_collection"):
            setattr(self_cp, "_astra_db_collection", None)

        return _asdict(self_cp, **kwargs)

    @property
    @requires_dependencies(["astrapy"], extras="astra")
    def astra_db_collection(self) -> "AstraDBCollection":
        """Lazily create the AstraDB client and (create-and-)connect the collection."""
        if self._astra_db_collection is None:
            from astrapy.db import AstraDB

            collection_name = self.connector_config.collection_name
            embedding_dimension = self.write_config.embedding_dimension

            # If the user has requested an indexing policy, pass it to the AstraDB
            requested_indexing_policy = self.write_config.requested_indexing_policy
            options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None

            # caller_name/version for AstraDB tracking
            self._astra_db = AstraDB(
                api_endpoint=self.connector_config.access_config.api_endpoint,
                token=self.connector_config.access_config.token,
                namespace=self.connector_config.namespace,
                caller_name=integration_name,
                caller_version=integration_version,
            )

            # Create and connect to the newly created collection
            self._astra_db_collection = self._astra_db.create_collection(
                collection_name=collection_name,
                dimension=embedding_dimension,
                options=options,
            )
        return self._astra_db_collection

    @requires_dependencies(["astrapy"], extras="astra")
    @DestinationConnectionError.wrap
    def initialize(self):
        # Touch the lazy property so connection problems surface early.
        _ = self.astra_db_collection

    @requires_dependencies(["astrapy"], extras="astra")
    def check_connection(self):
        """Validate connectivity; wrap any failure in DestinationConnectionError."""
        try:
            _ = self.astra_db_collection
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Insert element dicts into Astra in batches of write_config.batch_size."""
        logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra.")

        astra_batch_size = self.write_config.batch_size

        # FIX: go through the lazy astra_db_collection property instead of the
        # raw _astra_db_collection attribute. to_dict() deliberately nulls the
        # attribute for serialization, so a deserialized connector would hit
        # AttributeError on None here instead of reconnecting lazily.
        collection = self.astra_db_collection
        for batch in batch_generator(elements_dict, astra_batch_size):
            collection.insert_many(batch)

    def normalize_dict(self, element_dict: dict) -> dict:
        """Map an element dict onto the Astra document schema.

        Moves the embedding vector to "$vector" and the element text to
        "content"; everything remaining becomes the "metadata" payload.
        """
        return {
            "$vector": element_dict.pop("embeddings", None),
            "content": element_dict.pop("text", None),
            "metadata": element_dict,
        }
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import typing as t
|
|
3
|
+
import uuid
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
7
|
+
from unstructured_ingest.error import DestinationConnectionError, WriteError
|
|
8
|
+
from unstructured_ingest.interfaces import (
|
|
9
|
+
AccessConfig,
|
|
10
|
+
BaseConnectorConfig,
|
|
11
|
+
BaseDestinationConnector,
|
|
12
|
+
WriteConfig,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
16
|
+
|
|
17
|
+
if t.TYPE_CHECKING:
|
|
18
|
+
from azure.search.documents import SearchClient
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class AzureCognitiveSearchAccessConfig(AccessConfig):
    """Credentials for Azure Cognitive Search."""

    # API key for the search service; marked sensitive so it is redacted
    # when the config is serialized or logged.
    key: str = enhanced_field(sensitive=True)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class SimpleAzureCognitiveSearchStorageConfig(BaseConnectorConfig):
    """Connection settings for an Azure Cognitive Search service."""

    # Service endpoint URL, e.g. https://<service>.search.windows.net.
    endpoint: str
    # API-key credentials (see AzureCognitiveSearchAccessConfig).
    access_config: AzureCognitiveSearchAccessConfig
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class AzureCognitiveSearchWriteConfig(WriteConfig):
    """Write-side settings for the Azure Cognitive Search destination."""

    # Name of the search index documents are written to.
    index: str
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
    """Destination connector that uploads element dicts to an Azure Cognitive Search index.

    The SearchClient is created lazily on first access and cached on the
    instance for reuse across calls.
    """

    write_config: AzureCognitiveSearchWriteConfig
    connector_config: SimpleAzureCognitiveSearchStorageConfig
    # Cached client; excluded from __init__ and built on demand by `client`.
    _client: t.Optional["SearchClient"] = field(init=False, default=None)

    @requires_dependencies(["azure.search"], extras="azure-cognitive-search")
    def generate_client(self) -> "SearchClient":
        """Build a SearchClient from the configured endpoint, index name, and API key."""
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents import SearchClient

        # Create a client
        credential = AzureKeyCredential(self.connector_config.access_config.key)
        return SearchClient(
            endpoint=self.connector_config.endpoint,
            index_name=self.write_config.index,
            credential=credential,
        )

    @property
    def client(self) -> "SearchClient":
        # Lazily create and cache the client so repeated writes reuse one instance.
        if self._client is None:
            self._client = self.generate_client()
        return self._client

    def check_connection(self):
        """Validate connectivity with a cheap document-count request.

        Raises:
            DestinationConnectionError: if any exception occurs while contacting
                the service (original exception is logged with traceback).
        """
        try:
            self.client.get_document_count()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

    def initialize(self):
        # Force client construction early so configuration errors surface up front.
        _ = self.client

    def conform_dict(self, data: dict) -> None:
        """
        updates the dictionary that is from each Element being converted into a dict/json
        into a dictionary that conforms to the schema expected by the
        Azure Cognitive Search index
        """
        from dateutil import parser  # type: ignore

        # Every index document needs a unique key.
        data["id"] = str(uuid.uuid4())

        # Structured metadata values are serialized to JSON strings, and numeric
        # values stringified, so they fit the flat field types of the index.
        # NOTE(review): the walrus checks skip falsy values (e.g. a page_number
        # of 0 would be left untouched) — presumed acceptable; confirm.
        if points := data.get("metadata", {}).get("coordinates", {}).get("points"):
            data["metadata"]["coordinates"]["points"] = json.dumps(points)
        if version := data.get("metadata", {}).get("data_source", {}).get("version"):
            data["metadata"]["data_source"]["version"] = str(version)
        if record_locator := data.get("metadata", {}).get("data_source", {}).get("record_locator"):
            data["metadata"]["data_source"]["record_locator"] = json.dumps(record_locator)
        if permissions_data := (
            data.get("metadata", {}).get("data_source", {}).get("permissions_data")
        ):
            data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data)
        if links := data.get("metadata", {}).get("links"):
            data["metadata"]["links"] = [json.dumps(link) for link in links]
        # Date-like fields are re-parsed and normalized to a fixed "Z"-suffixed
        # ISO-8601 string format.
        if last_modified := data.get("metadata", {}).get("last_modified"):
            data["metadata"]["last_modified"] = parser.parse(last_modified).strftime(
                "%Y-%m-%dT%H:%M:%S.%fZ",
            )
        if date_created := data.get("metadata", {}).get("data_source", {}).get("date_created"):
            data["metadata"]["data_source"]["date_created"] = parser.parse(date_created).strftime(
                "%Y-%m-%dT%H:%M:%S.%fZ",
            )
        if date_modified := data.get("metadata", {}).get("data_source", {}).get("date_modified"):
            data["metadata"]["data_source"]["date_modified"] = parser.parse(date_modified).strftime(
                "%Y-%m-%dT%H:%M:%S.%fZ",
            )
        if date_processed := data.get("metadata", {}).get("data_source", {}).get("date_processed"):
            data["metadata"]["data_source"]["date_processed"] = parser.parse(
                date_processed,
            ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
            data["metadata"]["regex_metadata"] = json.dumps(regex_metadata)
        if page_number := data.get("metadata", {}).get("page_number"):
            data["metadata"]["page_number"] = str(page_number)

    @requires_dependencies(["azure"], extras="azure-cognitive-search")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upload documents to the index.

        Raises:
            WriteError: on an HTTP-level failure, or when any individual
                document in the batch fails to upload.
        """
        import azure.core.exceptions

        logger.info(
            f"writing {len(elements_dict)} documents to destination "
            f"index at {self.write_config.index}",
        )
        try:
            results = self.client.upload_documents(documents=elements_dict)

        except azure.core.exceptions.HttpResponseError as http_error:
            raise WriteError(f"http error: {http_error}") from http_error
        errors = []
        success = []
        # A single upload call can partially fail: partition per-document results.
        for result in results:
            if result.succeeded:
                success.append(result)
            else:
                errors.append(result)
        logger.debug(f"results: {len(success)} successes, {len(errors)} failures")
        if errors:
            raise WriteError(
                ", ".join(
                    [
                        f"{error.key}: [{error.status_code}] {error.error_message}"
                        for error in errors
                    ],
                ),
            )
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import typing as t
|
|
3
|
+
import urllib.request
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from ftplib import FTP, error_perm
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import requests
|
|
9
|
+
from requests.adapters import HTTPAdapter
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
12
|
+
from unstructured_ingest.interfaces import (
|
|
13
|
+
BaseConnectorConfig,
|
|
14
|
+
BaseSingleIngestDoc,
|
|
15
|
+
BaseSourceConnector,
|
|
16
|
+
IngestDocCleanupMixin,
|
|
17
|
+
SourceConnectorCleanupMixin,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.logger import logger
|
|
20
|
+
from unstructured_ingest.utils.data_prep import (
|
|
21
|
+
validate_date_args,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
# NCBI open-access FTP host and the directory layout used throughout this module.
DOMAIN = "ftp.ncbi.nlm.nih.gov"
FTP_DOMAIN = f"ftp://{DOMAIN}"
# Root of the PubMed Central area on the FTP server.
PMC_DIR = "pub/pmc"
# Subdirectory holding the open-access PDF tree; configured paths must start here.
PDF_DIR = "oa_pdf"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class BiomedFileMeta:
    """Locations associated with one remote Biomed (PMC) file."""

    # Full ftp:// URL of the remote file.
    ftp_path: str
    # Local path the file is downloaded to.
    download_filepath: str
    # Local output path; BiomedIngestDoc appends ".json" to it for results.
    output_filepath: str
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class SimpleBiomedConfig(BaseConnectorConfig):
    """Connector config where path is the FTP directory path and
    id_, from_, until, format are API parameters."""

    path: t.Optional[str] = None
    # OA Web Service API Options
    api_id: t.Optional[str] = None
    api_from: t.Optional[str] = None
    api_until: t.Optional[str] = None
    # Request timeout (seconds) used by the source connector's API calls.
    max_request_time: int = 45

    def validate_api_inputs(self) -> bool:
        """Return True when at least one date argument is supplied and every
        supplied date argument passes ``validate_date_args``.

        Bug fix: the previous implementation overwrote the ``api_from`` check
        result with the ``api_until`` check result, so an invalid ``api_from``
        combined with a valid ``api_until`` was accepted.
        """
        results = [
            validate_date_args(date_arg)
            for date_arg in (self.api_from, self.api_until)
            if date_arg
        ]
        return bool(results) and all(results)

    def __post_init__(self):
        self.is_file = False
        self.is_dir = False
        self.is_api = False

        if not self.path:
            # No FTP path: fall back to the OA Web Service API, requiring at
            # least one usable date filter.
            is_valid = self.validate_api_inputs()
            if not is_valid:
                raise ValueError(
                    "Path argument or at least one of the "
                    "OA Web Service arguments MUST be provided.",
                )

            self.is_api = True
        else:
            self.path = self.path.strip("/")
            is_valid = self.path.lower().startswith(PDF_DIR)

            if not is_valid:
                raise ValueError(f"Path MUST start with {PDF_DIR}")

            # Probe the FTP server to classify the path as a file or directory.
            ftp = FTP(DOMAIN)
            ftp.login()

            path = Path(PMC_DIR) / self.path
            response = ""
            try:
                if path.suffix == ".pdf":
                    response = ftp.cwd(str(path.parent))
                    self.is_file = True
                else:
                    response = ftp.cwd(str(path))
            except error_perm as exc:
                if "no such file or directory" in exc.args[0].lower():
                    raise ValueError(f"The path: {path} is not valid.")
                elif "not a directory" in exc.args[0].lower():
                    self.is_file = True
                elif "command successful" in response:
                    # NOTE(review): inside this except block ``response`` is
                    # still "" (cwd raised before assigning), so this branch
                    # looks unreachable and is_dir may never be set — confirm
                    # intent before relying on is_dir.
                    self.is_dir = True
                else:
                    # Bug fix: this message was missing its f-prefix, so the
                    # literal text "{path}" was reported instead of the path.
                    raise ValueError(
                        f"Something went wrong when validating the path: {path}.",
                    )
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@dataclass
class BiomedIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single remote Biomed (PMC) document: knows where to download it from
    and where the raw/processed files live locally."""

    connector_config: SimpleBiomedConfig
    file_meta: BiomedFileMeta
    registry_name: str = "biomed"

    @property
    def filename(self):
        # Absolute local path the raw file is downloaded to.
        return Path(self.file_meta.download_filepath).resolve()  # type: ignore

    @property
    def _output_filename(self):
        # Processed output path: the configured output filepath plus ".json".
        return Path(f"{self.file_meta.output_filepath}.json").resolve()

    def cleanup_file(self):
        """Delete the downloaded file unless downloads are being preserved or
        this is a download-only run."""
        if (
            not self.read_config.preserve_downloads
            and self.filename.is_file()
            and not self.read_config.download_only
        ):
            logger.debug(f"Cleaning up {self}")
            Path.unlink(self.filename)

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the remote file to ``download_filepath``, creating the
        parent directory if needed. Skipped entirely if the file already
        exists (via the decorator)."""
        download_path = self.file_meta.download_filepath  # type: ignore
        dir_ = Path(os.path.dirname(download_path))  # type: ignore
        if not dir_.is_dir():
            logger.debug(f"Creating directory: {dir_}")

            # Guard against an empty dirname (file in the current directory).
            if dir_:
                dir_.mkdir(parents=True, exist_ok=True)
        self._retrieve()
        logger.debug(f"File downloaded: {self.file_meta.download_filepath}")

    @SourceConnectionNetworkError.wrap
    def _retrieve(self):
        # Fetch via urllib; the wrap decorator converts network failures into
        # SourceConnectionNetworkError.
        urllib.request.urlretrieve(
            self.file_meta.ftp_path,  # type: ignore
            self.file_meta.download_filepath,
        )
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Objects of this class support fetching documents from Biomedical literature FTP directory"""

    # NOTE(review): unlike the sibling connectors, this class is not decorated
    # with @dataclass — confirm the annotated field below is actually wired up
    # by the base class.
    connector_config: SimpleBiomedConfig

    def get_base_endpoints_url(self) -> str:
        """Build the OA Web Service query URL from the configured API params
        (id / from / until); format is pinned to pdf."""
        endpoint_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?format=pdf"

        if self.connector_config.api_id:
            endpoint_url += f"&id={self.connector_config.api_id}"

        if self.connector_config.api_from:
            endpoint_url += f"&from={self.connector_config.api_from}"

        if self.connector_config.api_until:
            endpoint_url += f"&until={self.connector_config.api_until}"

        return endpoint_url

    def _list_objects_api(self) -> t.List[BiomedFileMeta]:
        """List files via the OA Web Service API, following resumption-token
        links until the result set is exhausted."""
        from bs4 import BeautifulSoup

        def urls_to_metadata(urls):
            # Convert FTP URLs into file metadata, keeping only URLs that live
            # under the open-access PDF tree (contain PDF_DIR).
            files = []
            for url in urls:
                parts = url.split(PDF_DIR)
                if len(parts) > 1:
                    local_path = parts[1].strip("/")
                    files.append(
                        BiomedFileMeta(
                            ftp_path=url,
                            download_filepath=(Path(self.read_config.download_dir) / local_path)
                            .resolve()
                            .as_posix(),
                            output_filepath=(Path(self.processor_config.output_dir) / local_path)
                            .resolve()
                            .as_posix(),
                        ),
                    )

            return files

        files: t.List[BiomedFileMeta] = []

        endpoint_url = self.get_base_endpoints_url()

        while endpoint_url:
            session = requests.Session()
            adapter = HTTPAdapter()
            session.mount("http://", adapter)
            session.mount("https://", adapter)
            response = self._get_request(session=session, endpoint_url=endpoint_url)
            soup = BeautifulSoup(response.content, features="lxml")
            urls = [link["href"] for link in soup.find_all("link")]

            if not urls:
                return files

            # The API signals more results by making the last <link> a
            # resumption-token URL; follow it on the next iteration.
            endpoint_url = urls[-1] if "resumptiontoken" in urls[-1].lower() else None
            if endpoint_url:
                urls = urls[:-1]

            files.extend(urls_to_metadata(urls))

        return files

    @SourceConnectionNetworkError.wrap
    def _get_request(self, session: requests.Session, endpoint_url: str) -> requests.Response:
        # Single GET with the configured timeout; the wrap decorator converts
        # network failures into SourceConnectionNetworkError.
        return session.get(endpoint_url, timeout=self.connector_config.max_request_time)

    def _list_objects(self) -> t.List[BiomedFileMeta]:
        """List files under the configured FTP path, recursing into
        subdirectories when the path is a directory."""
        files = []

        # Conform to mypy, null check performed elsewhere.
        # Wouldn't be in this method unless self.config.path exists
        path: str = self.connector_config.path if self.connector_config.path else ""

        def traverse(path, download_dir, output_dir):
            # Walk one FTP directory level, appending file metadata to the
            # enclosing ``files`` list and recursing into subdirectories.
            full_path = Path(PMC_DIR) / path
            logger.debug(f"Traversing directory: {full_path}")

            # NOTE(review): a new FTP connection is opened per directory level
            # and never explicitly closed.
            ftp = FTP(DOMAIN)
            ftp.login()

            try:
                response = ftp.cwd(str(full_path))
            except error_perm:
                raise ValueError(f"{full_path} is not a valid directory.")

            if "command successful" in response.lower():
                sub_paths = [path / p for p in ftp.nlst()]

                if not sub_paths:
                    return

                # Heuristic: if the first listed entry has a file extension,
                # treat the whole listing as files; otherwise recurse into
                # each entry as a subdirectory.
                ext = Path(sub_paths[0]).suffix
                if ext:
                    for sub_path in sub_paths:
                        ftp_path = f"{FTP_DOMAIN}/{PMC_DIR}/{sub_path}"
                        # Drop the first path component to form the local
                        # relative path.
                        local_path = "/".join(str(sub_path).split("/")[1:])
                        files.append(
                            BiomedFileMeta(
                                ftp_path=ftp_path,
                                download_filepath=(Path(self.read_config.download_dir) / local_path)
                                .resolve()
                                .as_posix(),
                                output_filepath=(
                                    Path(self.processor_config.output_dir) / local_path
                                )
                                .resolve()
                                .as_posix(),
                            ),
                        )

                else:
                    for sub_path in sub_paths:
                        traverse(sub_path, download_dir, output_dir)

            else:
                raise ValueError(f"{full_path} is not a valid directory.")

        ftp_path = f"{FTP_DOMAIN}/{PMC_DIR}/{self.connector_config.path}"
        if self.connector_config.is_file:
            local_path = "/".join(path.split("/")[1:])
            return [
                BiomedFileMeta(
                    ftp_path=ftp_path,
                    download_filepath=(Path(self.read_config.download_dir) / local_path)
                    .resolve()
                    .as_posix(),
                    output_filepath=(Path(self.processor_config.output_dir) / local_path)
                    .resolve()
                    .as_posix(),
                ),
            ]
        else:
            traverse(
                Path(path),
                Path(self.read_config.download_dir),
                Path(self.processor_config.output_dir),
            )

        return files

    def initialize(self):
        pass

    def check_connection(self):
        """Validate the OA Web Service endpoint responds to a HEAD request.

        Raises:
            SourceConnectionError: if the endpoint returns an HTTP error status.
        """
        # NOTE(review): no timeout is passed here, unlike _get_request which
        # uses connector_config.max_request_time — confirm this is intended.
        resp = requests.head(self.get_base_endpoints_url())
        try:
            resp.raise_for_status()
        except requests.HTTPError as http_error:
            raise SourceConnectionError(f"failed to validate connection: {http_error}")

    def get_ingest_docs(self):
        # Choose API listing or FTP listing based on how the config classified
        # itself in __post_init__ (is_api vs path-based).
        files = self._list_objects_api() if self.connector_config.is_api else self._list_objects()
        return [
            BiomedIngestDoc(
                processor_config=self.processor_config,
                connector_config=self.connector_config,
                read_config=self.read_config,
                file_meta=file,
            )
            for file in files
        ]
|