unstructured-ingest 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +32 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astra.py +99 -0
- unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +656 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astra.py +237 -0
- unstructured_ingest/connector/azure_cognitive_search.py +144 -0
- unstructured_ingest/connector/biomed.py +313 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +173 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +349 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +294 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +285 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +233 -0
- unstructured_ingest/connector/notion/connector.py +468 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +95 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +72 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +44 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +248 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/evaluate.py +338 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +838 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +265 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +130 -0
- unstructured_ingest/pipeline/reformat/embedding.py +66 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astra.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astra.py +22 -0
- unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/compression.py +117 -0
- unstructured_ingest/utils/data_prep.py +112 -0
- unstructured_ingest/utils/dep_check.py +66 -0
- unstructured_ingest/utils/string_and_date_utils.py +39 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +215 -0
- unstructured_ingest/v2/cli/base/dest.py +76 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +70 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
- unstructured_ingest/v2/cli/cmds/astra.py +85 -0
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
- unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
- unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
- unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
- unstructured_ingest/v2/cli/cmds/local.py +60 -0
- unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
- unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
- unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
- unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
- unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
- unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
- unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
- unstructured_ingest/v2/cli/cmds/sql.py +84 -0
- unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
- unstructured_ingest/v2/cli/configs/__init__.py +6 -0
- unstructured_ingest/v2/cli/configs/chunk.py +89 -0
- unstructured_ingest/v2/cli/configs/embed.py +74 -0
- unstructured_ingest/v2/cli/configs/partition.py +99 -0
- unstructured_ingest/v2/cli/configs/processor.py +88 -0
- unstructured_ingest/v2/cli/interfaces.py +27 -0
- unstructured_ingest/v2/cli/utils.py +240 -0
- unstructured_ingest/v2/example.py +37 -0
- unstructured_ingest/v2/interfaces/__init__.py +29 -0
- unstructured_ingest/v2/interfaces/connector.py +32 -0
- unstructured_ingest/v2/interfaces/downloader.py +79 -0
- unstructured_ingest/v2/interfaces/file_data.py +49 -0
- unstructured_ingest/v2/interfaces/indexer.py +28 -0
- unstructured_ingest/v2/interfaces/process.py +20 -0
- unstructured_ingest/v2/interfaces/processor.py +48 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
- unstructured_ingest/v2/interfaces/uploader.py +39 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +167 -0
- unstructured_ingest/v2/pipeline/pipeline.py +284 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
- unstructured_ingest/v2/pipeline/steps/download.py +124 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
- unstructured_ingest/v2/pipeline/steps/index.py +61 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
- unstructured_ingest/v2/pipeline/utils.py +15 -0
- unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured_ingest/v2/processes/chunker.py +97 -0
- unstructured_ingest/v2/processes/connector_registry.py +63 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
- unstructured_ingest/v2/processes/connectors/astra.py +152 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
- unstructured_ingest/v2/processes/connectors/local.py +204 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
- unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
- unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
- unstructured_ingest/v2/processes/connectors/sql.py +269 -0
- unstructured_ingest/v2/processes/connectors/utils.py +19 -0
- unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
- unstructured_ingest/v2/processes/embedder.py +76 -0
- unstructured_ingest/v2/processes/partitioner.py +166 -0
- unstructured_ingest/v2/processes/uncompress.py +43 -0
- unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
- unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
- unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
- unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import typing as t
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime as dt
|
|
5
|
+
from multiprocessing import Process
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
9
|
+
from unstructured_ingest.interfaces import (
|
|
10
|
+
BaseConnectorConfig,
|
|
11
|
+
BaseDestinationConnector,
|
|
12
|
+
BaseSingleIngestDoc,
|
|
13
|
+
BaseSourceConnector,
|
|
14
|
+
IngestDocCleanupMixin,
|
|
15
|
+
SourceConnectorCleanupMixin,
|
|
16
|
+
SourceMetadata,
|
|
17
|
+
WriteConfig,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.logger import logger
|
|
20
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
21
|
+
|
|
22
|
+
if t.TYPE_CHECKING:
|
|
23
|
+
from deltalake import DeltaTable
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class SimpleDeltaTableConfig(BaseConnectorConfig):
    """Settings that identify a Delta table and how it should be opened."""

    # Location of the table, given either as a URI string or as a filesystem path.
    table_uri: t.Union[str, Path]
    # Forwarded to ``DeltaTable`` as ``version`` by the source connector.
    version: t.Optional[int] = None
    # Forwarded to ``DeltaTable`` as ``storage_options`` by the source connector.
    storage_options: t.Optional[t.Dict[str, str]] = None
    # Forwarded to ``DeltaTable`` as ``without_files`` by the source connector.
    without_files: bool = False
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class DeltaTableIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Ingest doc for one data file (``uri``) of a Delta table.

    ``get_file`` reads the file as a parquet dataset and stores it as a CSV in the
    download directory.
    """

    connector_config: SimpleDeltaTableConfig
    uri: str
    modified_date: str
    created_at: str
    registry_name: str = "delta-table"

    def uri_filename(self) -> str:
        """Return the last path component of ``uri`` with its extension removed."""
        basename = os.path.basename(self.uri)
        return os.path.splitext(basename)[0]

    @property
    def filename(self):
        """Resolved local path of the CSV file the table data is downloaded to."""
        return (Path(self.read_config.download_dir) / f"{self.uri_filename()}.csv").resolve()

    @property
    def _output_filename(self):
        """Path of the JSON output file in the output dir, named after the source file."""
        return Path(self.processor_config.output_dir) / f"{self.uri_filename()}.json"

    def _create_full_tmp_dir_path(self):
        """Create the parent directories of the download and output files if missing."""
        self.filename.parent.mkdir(parents=True, exist_ok=True)
        self._output_filename.parent.mkdir(parents=True, exist_ok=True)

    @requires_dependencies(["fsspec"], extras="delta-table")
    def _get_fs_from_uri(self):
        """Return the fsspec filesystem inferred from ``uri``.

        Raises:
            ImportError: if resolving the filesystem fails on a missing dependency.
        """
        from fsspec.core import url_to_fs

        try:
            fs, _ = url_to_fs(self.uri)
        except ImportError as error:
            # Chain the original error so the missing module stays visible in the traceback.
            raise ImportError(
                f"uri {self.uri} may be associated with a filesystem that "
                f"requires additional dependencies: {error}",
            ) from error
        return fs

    def update_source_metadata(self, **kwargs):
        """Populate ``self.source_metadata`` for ``uri``.

        Args:
            **kwargs: may carry ``fs``, an already-resolved filesystem for ``uri``; when
                absent (or ``None``) the filesystem is resolved from the uri.
        """
        # Resolve lazily: a default handed to ``kwargs.get`` is evaluated even when the
        # caller supplied ``fs``, which needlessly resolved the filesystem a second time.
        fs = kwargs.get("fs")
        if fs is None:
            fs = self._get_fs_from_uri()
        # fsspec filesystems may declare ``protocol`` as a single string or a tuple of
        # aliases, so normalise before testing for the "gs" protocol.
        protocol = fs.protocol
        protocols = (protocol,) if isinstance(protocol, str) else tuple(protocol)
        if "gs" in protocols:
            version = fs.info(self.uri).get("etag", "")
        else:
            version = fs.checksum(self.uri)
        file_exists = fs.exists(self.uri)
        self.source_metadata = SourceMetadata(
            date_created=self.created_at,
            date_modified=self.modified_date,
            version=version,
            source_url=self.uri,
            exists=file_exists,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Read the table file and write its rows to ``self.filename`` as CSV."""
        fs = self._get_fs_from_uri()
        self.update_source_metadata(fs=fs)
        logger.info(f"using a {fs} filesystem to collect table data")
        self._create_full_tmp_dir_path()

        df = self._get_df(filesystem=fs)

        logger.info(f"writing {len(df)} rows to {self.filename}")
        df.to_csv(self.filename)

    @SourceConnectionNetworkError.wrap
    def _get_df(self, filesystem):
        """Load ``uri`` as a parquet dataset through ``filesystem`` into a pandas DataFrame."""
        import pyarrow.parquet as pq

        return pq.ParquetDataset(self.uri, filesystem=filesystem).read_pandas().to_pandas()
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@dataclass
class DeltaTableSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that yields one ingest doc per data file of a Delta table."""

    connector_config: SimpleDeltaTableConfig
    # Set by ``initialize``; stays None until then.
    delta_table: t.Optional["DeltaTable"] = None

    def check_connection(self):
        """No connection check is performed for this connector."""
        pass

    @requires_dependencies(["deltalake"], extras="delta-table")
    def initialize(self):
        """Open the configured Delta table and verify that it contains rows.

        Raises:
            ValueError: if the table has no rows.
        """
        from deltalake import DeltaTable

        self.delta_table = DeltaTable(
            table_uri=self.connector_config.table_uri,
            version=self.connector_config.version,
            storage_options=self.connector_config.storage_options,
            without_files=self.connector_config.without_files,
        )
        rows = self.delta_table.to_pyarrow_dataset().count_rows()
        if rows <= 0:
            raise ValueError(f"no data found at {self.connector_config.table_uri}")
        logger.info(f"processing {rows} rows of data")

    def get_ingest_docs(self):
        """Return one ``DeltaTableIngestDoc`` per file uri of the table.

        Raises:
            ValueError: if ``initialize`` has not set ``delta_table``.
        """
        if not self.delta_table:
            raise ValueError("delta table was never initialized")
        actions = self.delta_table.get_add_actions().to_pandas()
        # Key by basename because that is what the lookup below uses; keying by the full
        # add-action path raised KeyError whenever the path carried a directory prefix.
        # Iterating the two columns directly also avoids building a Series per row.
        mod_date_dict = {
            os.path.basename(path): str(modified)
            for path, modified in zip(actions["path"], actions["modification_time"])
        }
        # created_time is divided by 1000 before conversion (presumably milliseconds
        # since the epoch — confirm against deltalake's Metadata docs).
        created_at = str(dt.fromtimestamp(self.delta_table.metadata().created_time / 1000))
        return [
            DeltaTableIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                uri=uri,
                modified_date=mod_date_dict[os.path.basename(uri)],
                created_at=created_at,
            )
            for uri in self.delta_table.file_uris()
        ]
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@dataclass
class DeltaTableWriteConfig(WriteConfig):
    """Options controlling how elements are written to the destination Delta table."""

    # Passed to ``convert_to_pandas_dataframe`` when building the frame to write.
    drop_empty_cols: bool = False
    # Passed to ``write_deltalake`` as ``mode``.
    mode: t.Literal["error", "append", "overwrite", "ignore"] = "error"
    # Passed to ``write_deltalake`` as ``schema_mode`` only when it is not None.
    schema_mode: t.Optional[t.Literal["merge", "overwrite"]] = None
    # Passed to ``write_deltalake`` as ``engine``.
    engine: t.Literal["pyarrow", "rust"] = "pyarrow"
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@dataclass
class DeltaTableDestinationConnector(BaseDestinationConnector):
    """Destination connector that writes element dicts to a Delta table."""

    write_config: DeltaTableWriteConfig
    connector_config: SimpleDeltaTableConfig

    @requires_dependencies(["deltalake"], extras="delta-table")
    def initialize(self):
        """Nothing to set up beyond the dependency check done by the decorator."""
        pass

    def check_connection(self):
        """No connection check is performed for this connector."""
        pass

    @requires_dependencies(["deltalake"], extras="delta-table")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Convert ``elements_dict`` to a DataFrame and write it to the configured table.

        Args:
            elements_dict: element dictionaries to write, one row per element.
        """
        from deltalake.writer import write_deltalake

        from unstructured_ingest.utils.table import convert_to_pandas_dataframe

        df = convert_to_pandas_dataframe(
            elements_dict=elements_dict,
            drop_empty_cols=self.write_config.drop_empty_cols,
        )
        logger.info(
            f"writing {len(df)} rows to destination table "
            f"at {self.connector_config.table_uri}\ndtypes: {df.dtypes}",
        )
        writer_kwargs = {
            "table_or_uri": self.connector_config.table_uri,
            "data": df,
            "mode": self.write_config.mode,
            "engine": self.write_config.engine,
        }
        if self.write_config.schema_mode is not None:
            writer_kwargs["schema_mode"] = self.write_config.schema_mode
        # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
        # ingest to fail, even though all tasks are completed normally. Putting the writer into a
        # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
        # rust backend to finish
        writer = Process(
            target=write_deltalake,
            kwargs=writer_kwargs,
        )
        writer.start()
        writer.join()
        # Surface the child's outcome instead of dropping it silently. Nothing is raised
        # here because, per the note above, a signal may follow a write that succeeded.
        exit_code = writer.exitcode
        if exit_code is not None and exit_code > 0:
            logger.error(
                f"delta table writer process exited with code {exit_code}; "
                f"the write to {self.connector_config.table_uri} may have failed",
            )
        elif exit_code is not None and exit_code < 0:
            logger.warning(
                f"delta table writer process was terminated by signal {-exit_code}",
            )
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import datetime as dt
|
|
2
|
+
import typing as t
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
7
|
+
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
8
|
+
from unstructured_ingest.interfaces import (
|
|
9
|
+
AccessConfig,
|
|
10
|
+
BaseConnectorConfig,
|
|
11
|
+
BaseSingleIngestDoc,
|
|
12
|
+
BaseSourceConnector,
|
|
13
|
+
IngestDocCleanupMixin,
|
|
14
|
+
SourceConnectorCleanupMixin,
|
|
15
|
+
SourceMetadata,
|
|
16
|
+
)
|
|
17
|
+
from unstructured_ingest.logger import logger
|
|
18
|
+
from unstructured_ingest.utils.dep_check import (
|
|
19
|
+
requires_dependencies,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class DiscordAccessConfig(AccessConfig):
    """Credentials used to authenticate against Discord."""

    # Token handed to ``bot.run``; flagged as sensitive on the field.
    token: str = enhanced_field(sensitive=True)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class SimpleDiscordConfig(BaseConnectorConfig):
    """Connector config listing the Discord channels to pull messages from."""

    # Discord Specific Options
    access_config: DiscordAccessConfig
    # Channels to read, one entry per channel.
    channels: t.List[str]
    # NOTE(review): presumably a look-back window in days — confirm against the caller.
    period: t.Optional[int] = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
|
|
41
|
+
class DiscordIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
|
|
42
|
+
"""Class encapsulating fetching a doc and writing processed results (but not
|
|
43
|
+
doing the processing!).
|
|
44
|
+
Also includes a cleanup method. When things go wrong and the cleanup
|
|
45
|
+
method is not called, the file is left behind on the filesystem to assist debugging.
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
connector_config: SimpleDiscordConfig
|
|
49
|
+
channel: str
|
|
50
|
+
days: t.Optional[int] = None
|
|
51
|
+
registry_name: str = "discord"
|
|
52
|
+
|
|
53
|
+
# NOTE(crag): probably doesn't matter, but intentionally not defining tmp_download_file
|
|
54
|
+
# __post_init__ for multiprocessing simplicity (no Path objects in initially
|
|
55
|
+
# instantiated object)
|
|
56
|
+
def _tmp_download_file(self):
    """Return the ``<channel>.txt`` path inside the configured download directory."""
    file_name = self.channel + ".txt"
    download_dir = Path(self.read_config.download_dir)
    return download_dir / file_name
|
|
59
|
+
|
|
60
|
+
@property
def _output_filename(self):
    """Return the ``<channel>.json`` path inside the configured output directory."""
    file_name = self.channel + ".json"
    output_dir = Path(self.processor_config.output_dir)
    return output_dir / file_name
|
|
64
|
+
|
|
65
|
+
def _create_full_tmp_dir_path(self):
    """Make sure the directory that will hold the temporary download file exists."""
    download_dir = self._tmp_download_file().parent
    download_dir.mkdir(parents=True, exist_ok=True)
|
|
67
|
+
|
|
68
|
+
@SourceConnectionNetworkError.wrap
@requires_dependencies(dependencies=["discord"], extras="discord")
def _get_messages(self):
    """Fetch the channel's messages from Discord.

    Runs a bot with the configured token. Once ready, the bot reads the history of
    ``self.channel`` (only messages newer than ``self.days`` days when that is set)
    and then closes itself.

    Returns:
        A ``(messages, jump_url)`` tuple; ``jump_url`` is ``None`` when the channel's
        url was never recorded.
    """
    import discord
    from discord.ext import commands

    messages: t.List[discord.Message] = []
    jumpurl: t.List[str] = []
    intents = discord.Intents.default()
    intents.message_content = True
    bot = commands.Bot(command_prefix=">", intents=intents)

    @bot.event
    async def on_ready():
        try:
            after_date = None
            if self.days:
                # Use an aware UTC datetime: discord.py interprets naive datetimes as
                # local time, so a naive utcnow() shifted the cutoff by the UTC offset.
                after_date = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=self.days)
            channel = bot.get_channel(int(self.channel))
            jumpurl.append(channel.jump_url)  # type: ignore
            # NOTE(review): no ``limit`` is passed, so discord.py's default cap on the
            # number of returned messages applies — confirm that is intended.
            async for msg in channel.history(after=after_date):  # type: ignore
                messages.append(msg)
            await bot.close()
        except Exception:
            logger.error("Error fetching messages")
            await bot.close()
            raise

    bot.run(self.connector_config.access_config.token)
    jump_url = jumpurl[0] if jumpurl else None
    return messages, jump_url
|
|
100
|
+
|
|
101
|
+
def update_source_metadata(self, **kwargs):
|
|
102
|
+
messages, jump_url = kwargs.get("messages_tuple", self._get_messages())
|
|
103
|
+
if messages == []:
|
|
104
|
+
self.source_metadata = SourceMetadata(
|
|
105
|
+
exists=False,
|
|
106
|
+
)
|
|
107
|
+
return
|
|
108
|
+
dates = [m.created_at for m in messages if m.created_at]
|
|
109
|
+
dates.sort()
|
|
110
|
+
self.source_metadata = SourceMetadata(
|
|
111
|
+
date_created=dates[0].isoformat(),
|
|
112
|
+
date_modified=dates[-1].isoformat(),
|
|
113
|
+
source_url=jump_url,
|
|
114
|
+
exists=True,
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
@SourceConnectionError.wrap
|
|
118
|
+
@BaseSingleIngestDoc.skip_if_file_exists
|
|
119
|
+
def get_file(self):
|
|
120
|
+
self._create_full_tmp_dir_path()
|
|
121
|
+
|
|
122
|
+
messages, jump_url = self._get_messages()
|
|
123
|
+
self.update_source_metadata(messages_tuple=(messages, jump_url))
|
|
124
|
+
if messages == []:
|
|
125
|
+
raise ValueError(f"Failed to retrieve messages from Discord channel {self.channel}")
|
|
126
|
+
self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)
|
|
127
|
+
with open(self._tmp_download_file(), "w") as f:
|
|
128
|
+
for m in messages:
|
|
129
|
+
f.write(m.content + "\n")
|
|
130
|
+
|
|
131
|
+
@property
|
|
132
|
+
def filename(self):
|
|
133
|
+
"""The filename of the file created from a discord channel"""
|
|
134
|
+
return self._tmp_download_file()
|
|
135
|
+
|
|
136
|
+
@property
|
|
137
|
+
def version(self) -> t.Optional[str]:
|
|
138
|
+
return None
|
|
139
|
+
|
|
140
|
+
@property
|
|
141
|
+
def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
|
|
142
|
+
return {
|
|
143
|
+
"channel": self.channel,
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class DiscordSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Objects of this class support fetching document(s) from Discord channels."""

    connector_config: SimpleDiscordConfig

    def initialize(self):
        """No setup is required for this connector."""
        pass

    @requires_dependencies(dependencies=["discord"], extras="discord")
    def check_connection(self):
        """Validate the configured token by logging in to Discord.

        Raises:
            SourceConnectionError: If the login attempt fails for any reason.
        """
        import asyncio

        import discord
        from discord.client import Client

        async def _login_and_close():
            # Only log in: Client.start() would also open the gateway
            # connection and block until the client is closed, so a valid
            # token would never let this check return.
            client = Client(intents=discord.Intents.default())
            try:
                await client.login(self.connector_config.access_config.token)
            finally:
                await client.close()

        try:
            asyncio.run(_login_and_close())
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_ingest_docs(self):
        """Return one ``DiscordIngestDoc`` per configured channel."""
        return [
            DiscordIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                channel=channel,
                days=self.connector_config.period,
            )
            for channel in self.connector_config.channels
        ]
|