unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import json
|
|
3
|
+
import socket
|
|
4
|
+
import typing as t
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
9
|
+
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
10
|
+
from unstructured_ingest.interfaces import (
|
|
11
|
+
AccessConfig,
|
|
12
|
+
BaseConnectorConfig,
|
|
13
|
+
BaseDestinationConnector,
|
|
14
|
+
BaseIngestDoc,
|
|
15
|
+
BaseSingleIngestDoc,
|
|
16
|
+
BaseSourceConnector,
|
|
17
|
+
ConfigSessionHandleMixin,
|
|
18
|
+
IngestDocCleanupMixin,
|
|
19
|
+
IngestDocSessionHandleMixin,
|
|
20
|
+
SourceConnectorCleanupMixin,
|
|
21
|
+
WriteConfig,
|
|
22
|
+
)
|
|
23
|
+
from unstructured_ingest.logger import logger
|
|
24
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
25
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
26
|
+
|
|
27
|
+
if t.TYPE_CHECKING:
|
|
28
|
+
from confluent_kafka import Consumer, Producer
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class KafkaAccessConfig(AccessConfig):
    """Credentials for a Kafka cluster; both fields are declared as sensitive."""

    # Supplied as ``sasl.username`` when the connector config has ``confluent`` set.
    kafka_api_key: t.Optional[str] = enhanced_field(sensitive=True)
    # Supplied as ``sasl.password`` when the connector config has ``confluent`` set.
    secret: t.Optional[str] = enhanced_field(sensitive=True)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class SimpleKafkaConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
    """Connection settings shared by the Kafka source and destination connectors."""

    # Broker host; combined with ``port`` as "<bootstrap_server>:<port>".
    bootstrap_server: str
    port: str
    # Topic the source subscribes to and the destination produces to.
    topic: str
    access_config: KafkaAccessConfig
    # When truthy, SASL_SSL / PLAIN authentication is configured from ``access_config``.
    confluent: t.Optional[bool] = True
    # Number of messages the source collects before it stops polling.
    num_messages_to_consume: t.Optional[int] = 1
    # Timeout handed to every consumer ``poll`` call.
    timeout: t.Optional[float] = 1.0
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class KafkaIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """One consumed Kafka message, written to a local file so it can be processed."""

    connector_config: SimpleKafkaConfig
    raw_content: str
    raw_filename: str
    registry_name: str = "kafka"

    def _tmp_download_file(self):
        """Return the local download path: ``<download_dir>/<topic>-<raw_filename>``."""
        download_dir = Path(self.read_config.download_dir)
        return download_dir / (self.connector_config.topic + "-" + self.raw_filename)

    @property
    def version(self) -> t.Optional[str]:
        """Always ``None``; no version is derived for a Kafka message."""
        return None

    @property
    def source_url(self) -> t.Optional[str]:
        """Always ``None``; no source URL is derived for a Kafka message."""
        return None

    @property
    def filename(self):
        """The path of the local file created for this message."""
        return self._tmp_download_file()

    def _create_full_tmp_dir_path(self):
        """Create the directory that will hold the downloaded file, if it is missing."""
        target_dir = self._tmp_download_file().parent
        target_dir.mkdir(parents=True, exist_ok=True)

    @property
    def _output_filename(self):
        """Return the path of the processed output: ``<output_dir>/<topic>.json``.

        NOTE(review): the name depends only on the topic, so every message consumed
        from the same topic maps to the same output path -- confirm this is intended.
        """
        output_dir = Path(self.processor_config.output_dir)
        return output_dir / (self.connector_config.topic + ".json")

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Base64-decode ``raw_content`` and write the bytes to ``filename``."""
        self._create_full_tmp_dir_path()

        decoded = base64.b64decode(self.raw_content)

        with open(self.filename, "wb") as target:
            target.write(decoded)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@dataclass
class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector for Kafka.

    Consumes messages from a Kafka topic and creates one KafkaIngestDoc per message.

    Each message value is expected to be UTF-8 encoded JSON with the keys:
        filename: the name of the file (with correct file extension)
        content: the file content, base64 encoded (whether it was binary or not)
    """

    connector_config: SimpleKafkaConfig
    _consumer: t.Optional["Consumer"] = None

    def check_connection(self):
        """Validate the configuration by building the consumer.

        Raises:
            SourceConnectionError: if the consumer cannot be created.
        """
        try:
            self.kafka_consumer
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def initialize(self):
        """Subscribe the consumer to the configured topic."""
        topic = self.connector_config.topic
        logger.info(f"Subscribing to topic: {topic}")
        self.kafka_consumer.subscribe([topic])

    @property
    def kafka_consumer(self):
        """The consumer instance, created on first access and reused afterwards."""
        if self._consumer is None:
            self._consumer = self.create_consumer()
        return self._consumer

    @requires_dependencies(["confluent_kafka"], extras="kafka")
    def create_consumer(self) -> "Consumer":
        """Build a ``confluent_kafka.Consumer`` from the connector configuration."""
        from confluent_kafka import Consumer

        is_confluent = self.connector_config.confluent
        bootstrap = self.connector_config.bootstrap_server
        port = self.connector_config.port

        conf = {
            "bootstrap.servers": f"{bootstrap}:{port}",
            "client.id": socket.gethostname(),
            # NOTE(review): the consumer group is a hard-coded placeholder, so every
            # run shares it -- confirm whether it should be configurable.
            "group.id": "your_group_id",
            # Offsets are committed explicitly in get_ingest_docs().
            "enable.auto.commit": "false",
            "auto.offset.reset": "earliest",
            "message.max.bytes": 10485760,
        }

        if is_confluent:
            kafka_api_key = self.connector_config.access_config.kafka_api_key
            secret = self.connector_config.access_config.secret
            conf["sasl.mechanism"] = "PLAIN"
            conf["security.protocol"] = "SASL_SSL"
            conf["sasl.username"] = kafka_api_key
            conf["sasl.password"] = secret

        consumer = Consumer(conf)
        logger.debug(f"Kafka Consumer connected to bootstrap: {bootstrap}")
        return consumer

    @SourceConnectionError.wrap
    def get_ingest_docs(self):
        """Poll until ``num_messages_to_consume`` messages are collected.

        Polling continues (blocks) until enough messages arrive. Offsets are
        committed synchronously once the requested number has been collected.

        Returns:
            A list with one KafkaIngestDoc per collected message.
        """
        from confluent_kafka import KafkaError

        consumer = self.kafka_consumer

        collected = []
        num_messages_to_consume = self.connector_config.num_messages_to_consume
        logger.info(f"Config set for blocking on {num_messages_to_consume} messages")
        # Consume specified number of messages
        while True:
            msg = consumer.poll(timeout=self.connector_config.timeout)
            if msg is None:
                logger.debug("No Kafka messages found")
                continue

            error = msg.error()
            if error:
                if error.code() == KafkaError._PARTITION_EOF:
                    # End of partition event
                    logger.error(
                        "%% %s [%d] reached end at offset %d\n"
                        % (msg.topic(), msg.partition(), msg.offset())
                    )
                else:
                    # Previously dropped without a trace; surface it so a persistent
                    # failure is visible while the loop keeps polling.
                    logger.error(f"Kafka consumer error: {error}")
                continue

            collected.append(json.loads(msg.value().decode("utf8")))
            if len(collected) >= num_messages_to_consume:
                logger.debug(f"Found {len(collected)} messages, stopping")
                consumer.commit(asynchronous=False)
                break

        return [
            KafkaIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                raw_filename=msg["filename"],
                raw_content=msg["content"],
            )
            for msg in collected
        ]
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
@dataclass
class KafkaWriteConfig(WriteConfig):
    """Write-side options for the Kafka destination connector."""

    # Number of documents handed to each upload call.
    batch_size: int = 4
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
@dataclass
|
|
203
|
+
class KafkaDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConnector):
|
|
204
|
+
"""Connector to write BaseIngestDoc types to Kafka
|
|
205
|
+
Writes messages to Kafka in the format:
|
|
206
|
+
"type"<type>
|
|
207
|
+
"text":<the partitioned text>
|
|
208
|
+
"filename":<name of the upstream file>
|
|
209
|
+
"""
|
|
210
|
+
|
|
211
|
+
write_config: KafkaWriteConfig
|
|
212
|
+
connector_config: SimpleKafkaConfig
|
|
213
|
+
_producer: t.Optional["Producer"] = None
|
|
214
|
+
|
|
215
|
+
@property
|
|
216
|
+
def kafka_producer(self):
|
|
217
|
+
if self._producer is None:
|
|
218
|
+
self._producer = self.create_producer()
|
|
219
|
+
return self._producer
|
|
220
|
+
|
|
221
|
+
def initialize(self):
|
|
222
|
+
pass
|
|
223
|
+
|
|
224
|
+
@requires_dependencies(["confluent_kafka"], extras="kafka")
|
|
225
|
+
def create_producer(self) -> "Producer":
|
|
226
|
+
from confluent_kafka import Producer
|
|
227
|
+
|
|
228
|
+
is_confluent = self.connector_config.confluent
|
|
229
|
+
bootstrap = self.connector_config.bootstrap_server
|
|
230
|
+
port = self.connector_config.port
|
|
231
|
+
|
|
232
|
+
conf = {
|
|
233
|
+
"bootstrap.servers": f"{bootstrap}:{port}",
|
|
234
|
+
"client.id": socket.gethostname(),
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
if is_confluent:
|
|
238
|
+
api_key = self.connector_config.access_config.kafka_api_key
|
|
239
|
+
secret = self.connector_config.access_config.secret
|
|
240
|
+
conf["sasl.mechanism"] = "PLAIN"
|
|
241
|
+
conf["security.protocol"] = "SASL_SSL"
|
|
242
|
+
conf["sasl.username"] = api_key
|
|
243
|
+
conf["sasl.password"] = secret
|
|
244
|
+
|
|
245
|
+
producer = Producer(conf)
|
|
246
|
+
logger.debug(f"Connected to bootstrap: {bootstrap}")
|
|
247
|
+
return producer
|
|
248
|
+
|
|
249
|
+
def check_connection(self):
|
|
250
|
+
try:
|
|
251
|
+
self.kafka_producer
|
|
252
|
+
except Exception as e:
|
|
253
|
+
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
254
|
+
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
|
255
|
+
|
|
256
|
+
@DestinationConnectionError.wrap
|
|
257
|
+
def upload_msg(self, batch) -> int:
|
|
258
|
+
logger.debug(f"Uploading batch: {batch}")
|
|
259
|
+
topic = self.connector_config.topic
|
|
260
|
+
producer = self.kafka_producer
|
|
261
|
+
uploaded = 0
|
|
262
|
+
for i in range(len(batch)):
|
|
263
|
+
filename = f'{batch[i].pop("filename")}'
|
|
264
|
+
producer.produce(topic, key=filename, value=str(batch[i]))
|
|
265
|
+
uploaded += 1
|
|
266
|
+
return uploaded
|
|
267
|
+
|
|
268
|
+
@DestinationConnectionError.wrap
|
|
269
|
+
def write_dict(self, *args, dict_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
|
|
270
|
+
logger.info(f"Writing {len(dict_list)} documents to Kafka")
|
|
271
|
+
num_uploaded = 0
|
|
272
|
+
|
|
273
|
+
for chunk in batch_generator(dict_list, self.write_config.batch_size):
|
|
274
|
+
num_uploaded += self.upload_msg(chunk) # noqa: E203
|
|
275
|
+
|
|
276
|
+
producer = self.kafka_producer
|
|
277
|
+
producer.flush()
|
|
278
|
+
logger.info(f"Uploaded {num_uploaded} documents to Kafka")
|
|
279
|
+
|
|
280
|
+
def write(self, docs: t.List[BaseIngestDoc]) -> None:
    """Load each doc's JSON output and forward its elements to ``write_dict``.

    Every element read from a doc's ``_output_filename`` is reduced to its
    ``type``, ``text`` and ``metadata.filename`` fields before upload.

    Args:
        docs: ingest docs whose ``_output_filename`` points at a JSON file
            containing a list of element dicts.
    """
    records: t.List[t.Dict[str, t.Any]] = []
    for doc in docs:
        with open(doc._output_filename) as fp:
            elements = json.load(fp)
            records.extend(
                {
                    "type": element["type"],
                    "text": element["text"],
                    "filename": element["metadata"]["filename"],
                }
                for element in elements
            )
    self.write_dict(dict_list=records)
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import fnmatch
|
|
2
|
+
import glob
|
|
3
|
+
import os
|
|
4
|
+
import typing as t
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.interfaces import (
|
|
10
|
+
BaseConnectorConfig,
|
|
11
|
+
BaseSingleIngestDoc,
|
|
12
|
+
BaseSourceConnector,
|
|
13
|
+
SourceMetadata,
|
|
14
|
+
)
|
|
15
|
+
from unstructured_ingest.logger import logger
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class SimpleLocalConfig(BaseConnectorConfig):
    """Connector settings for reading documents from the local file system."""

    # Local specific options
    input_path: str
    recursive: bool = False
    file_glob: t.Optional[t.List[str]] = None

    def __post_init__(self):
        """Record whether ``input_path`` points at an existing regular file."""
        self.input_path_is_file = os.path.isfile(self.input_path)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class LocalIngestDoc(BaseSingleIngestDoc):
    """A single local file to ingest.

    Covers locating the file and naming its processed output; the processing
    itself happens elsewhere.
    """

    connector_config: SimpleLocalConfig
    path: str
    registry_name: str = "local"

    @property
    def base_filename(self) -> t.Optional[str]:
        """Path of this doc relative to the input directory.

        When the configured input path is itself a file, its parent
        directory is used as the base.
        """
        root = Path(self.connector_config.input_path).resolve()
        if root.is_file():
            root = root.parent
        return str(Path(self.filename).resolve().relative_to(root))

    @property
    def filename(self):
        """The local file to be processed, as a ``Path``."""
        return Path(self.path)

    def cleanup_file(self):
        """No-op: nothing to clean up for the local file system."""

    def get_file(self):
        """No-op: the file is already on the local file system."""

    def update_source_metadata(self, **kwargs) -> None:
        """Populate ``_source_metadata`` from an ``os.lstat`` of the file.

        If the file does not exist, the metadata only records
        ``exists=False``.
        """
        try:
            stat_result = os.lstat(self.path)
            created = datetime.fromtimestamp(stat_result.st_ctime)
            modified = datetime.fromtimestamp(stat_result.st_mtime)
            self._source_metadata = SourceMetadata(
                exists=True,
                date_created=str(created),
                date_modified=str(modified),
                permissions_data=[{"mode": stat_result.st_mode}],
                source_url=self.path,
            )
        except FileNotFoundError:
            self._source_metadata = SourceMetadata(exists=False)

    @property
    def _output_filename(self) -> Path:
        """Location of the JSON output for this doc under the output dir.

        If the input path is a file, the name is derived from
        ``base_filename``; if it is a folder, from the doc's path relative
        to that folder.
        """
        input_path = Path(self.connector_config.input_path)
        if input_path.is_file():
            stem = self.base_filename
        else:
            stem = Path(self.path).relative_to(input_path)
        return Path(self.processor_config.output_dir) / f"{stem}.json"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass
class LocalSourceConnector(BaseSourceConnector):
    """Source connector that fetches document(s) from the local file system."""

    connector_config: SimpleLocalConfig

    def check_connection(self):
        """No-op: there is no connection to check for the local file system."""
        pass

    def __post_init__(self):
        self.ingest_doc_cls: t.Type[LocalIngestDoc] = LocalIngestDoc

    def cleanup(self, cur_dir=None):
        """Not applicable to local file system"""

    def initialize(self):
        """Not applicable to local file system"""

    def _list_files(self):
        """Return candidate paths found at or under the configured input path.

        The input path is escaped before being used in a glob pattern, so
        that ``*``, ``?`` or ``[`` in a file or directory name is matched
        literally instead of being interpreted as a wildcard.

        Returns:
            A list of path strings. It may include directories; callers
            filter those out.
        """
        config = self.connector_config
        literal_root = glob.escape(f"{config.input_path}")
        if config.input_path_is_file:
            return glob.glob(literal_root)
        if config.recursive:
            return glob.glob(f"{literal_root}/**", recursive=True)
        return glob.glob(f"{literal_root}/*")

    def does_path_match_glob(self, path: str) -> bool:
        """Tell whether ``path`` matches any of the configured glob patterns.

        Args:
            path: candidate file path.

        Returns:
            ``True`` when no ``file_glob`` is configured or when at least one
            pattern matches ``path``; ``False`` otherwise.
        """
        patterns = self.connector_config.file_glob
        if patterns is None:
            return True
        if any(fnmatch.fnmatch(path, pattern) for pattern in patterns):
            return True
        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
        return False

    def get_ingest_docs(self):
        """Build one ingest doc per regular file that passes the glob filter."""
        return [
            self.ingest_doc_cls(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                path=file,
            )
            for file in self._list_files()
            if os.path.isfile(file) and self.does_path_match_glob(file)
        ]
|