unstructured-ingest 0.0.3__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/PKG-INFO +5 -2
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/setup.py +6 -2
- unstructured-ingest-0.0.4/test/test_chunking_utils.py +36 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/test/test_interfaces.py +2 -2
- unstructured-ingest-0.0.4/test/test_utils_v2.py +82 -0
- unstructured-ingest-0.0.4/unstructured_ingest/__version__.py +1 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cli.py +6 -1
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured-ingest-0.0.3/unstructured_ingest/cli/cmds/astra.py → unstructured-ingest-0.0.4/unstructured_ingest/cli/cmds/astradb.py +9 -9
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/interfaces.py +13 -6
- unstructured-ingest-0.0.3/unstructured_ingest/connector/astra.py → unstructured-ingest-0.0.4/unstructured_ingest/connector/astradb.py +29 -29
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/biomed.py +12 -5
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/confluence.py +3 -3
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/github.py +3 -2
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/google_drive.py +1 -2
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/mongodb.py +1 -2
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/client.py +31 -16
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/connector.py +3 -2
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/registry.py +2 -2
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/vectara.py +7 -2
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/interfaces.py +13 -9
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/interfaces.py +8 -3
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/__init__.py +2 -2
- unstructured-ingest-0.0.3/unstructured_ingest/runner/astra.py → unstructured-ingest-0.0.4/unstructured_ingest/runner/astradb.py +7 -7
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured-ingest-0.0.3/unstructured_ingest/runner/writers/astra.py → unstructured-ingest-0.0.4/unstructured_ingest/runner/writers/astradb.py +7 -7
- unstructured-ingest-0.0.4/unstructured_ingest/utils/chunking.py +45 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/utils/dep_check.py +1 -1
- unstructured-ingest-0.0.4/unstructured_ingest/utils/google_filetype.py +9 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/cli/base/cmd.py +57 -13
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/cli/base/dest.py +21 -12
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/cli/base/src.py +35 -23
- unstructured-ingest-0.0.4/unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/utils.py → unstructured-ingest-0.0.4/unstructured_ingest/v2/cli/utils/click.py +36 -89
- unstructured-ingest-0.0.4/unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/interfaces/connector.py +5 -7
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/interfaces/downloader.py +8 -5
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/interfaces/file_data.py +8 -2
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/interfaces/indexer.py +3 -4
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/interfaces/processor.py +10 -10
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/interfaces/uploader.py +3 -3
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/pipeline.py +1 -5
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/steps/download.py +13 -11
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured-ingest-0.0.4/unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured-ingest-0.0.4/unstructured_ingest/v2/processes/chunker.py +143 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connector_registry.py +8 -2
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured-ingest-0.0.3/unstructured_ingest/v2/processes/connectors/astra.py → unstructured-ingest-0.0.4/unstructured_ingest/v2/processes/connectors/astradb.py +45 -35
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
- unstructured-ingest-0.0.4/unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured-ingest-0.0.4/unstructured_ingest/v2/processes/connectors/databricks_volumes.py +160 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/local.py +22 -14
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/sql.py +29 -31
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
- unstructured-ingest-0.0.4/unstructured_ingest/v2/processes/embedder.py +135 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/filter.py +11 -5
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/partitioner.py +79 -33
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured-ingest-0.0.4/unstructured_ingest/v2/utils.py +45 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest.egg-info/PKG-INFO +5 -2
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest.egg-info/SOURCES.txt +15 -40
- unstructured-ingest-0.0.4/unstructured_ingest.egg-info/requires.txt +607 -0
- unstructured-ingest-0.0.3/unstructured_ingest/__version__.py +0 -1
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/local.py +0 -52
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/configs/__init__.py +0 -13
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured-ingest-0.0.3/unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured-ingest-0.0.3/unstructured_ingest/v2/processes/__init__.py +0 -0
- unstructured-ingest-0.0.3/unstructured_ingest/v2/processes/chunker.py +0 -97
- unstructured-ingest-0.0.3/unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -105
- unstructured-ingest-0.0.3/unstructured_ingest/v2/processes/embedder.py +0 -76
- unstructured-ingest-0.0.3/unstructured_ingest.egg-info/requires.txt +0 -208
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/README.md +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/pyproject.toml +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/setup.cfg +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/test/test_error.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/test/test_logger.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/test/test_utils.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/base/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/base/cmd.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/base/dest.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/base/src.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmd_factory.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/airtable.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/azure_cognitive_search.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/biomed.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/chroma.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/clarifai.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/confluence.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/databricks_volumes.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/delta_table.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/discord.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/elasticsearch.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/fsspec/azure.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/fsspec/box.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/fsspec/s3.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/github.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/gitlab.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/google_drive.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/hubspot.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/jira.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/kafka.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/local.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/mongodb.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/notion.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/onedrive.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/opensearch.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/outlook.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/pinecone.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/qdrant.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/reddit.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/salesforce.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/sharepoint.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/slack.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/sql.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/vectara.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/weaviate.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/wikipedia.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/common.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/utils.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/airtable.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/azure_cognitive_search.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/chroma.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/clarifai.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/databricks_volumes.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/delta_table.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/discord.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/elasticsearch.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/fsspec/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/fsspec/azure.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/fsspec/box.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/fsspec/dropbox.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/fsspec/fsspec.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/fsspec/gcs.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/fsspec/s3.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/fsspec/sftp.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/git.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/gitlab.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/hubspot.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/jira.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/kafka.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/local.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/helpers.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/interfaces.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/block.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/callout.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/child_page.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/code.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/divider.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/embed.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/equation.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/file.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/heading.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/image.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/link_to_page.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/paragraph.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/pdf.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/quote.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/table.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/template.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/todo.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/toggle.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/blocks/video.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/created_by.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/created_time.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/date.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/email.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/files.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/number.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/people.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/rich_text.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/rollup.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/select.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/status.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/title.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/url.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/database_properties/verification.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/date.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/file.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/page.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/parent.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/rich_text.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/notion/types/user.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/onedrive.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/opensearch.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/outlook.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/pinecone.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/qdrant.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/reddit.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/salesforce.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/sharepoint.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/slack.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/sql.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/weaviate.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/wikipedia.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/enhanced_dataclass/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/enhanced_dataclass/core.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/error.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/evaluate.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/ingest_backoff/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/ingest_backoff/_common.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/ingest_backoff/_wrapper.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/logger.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/main.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/copy.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/doc_factory.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/partition.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/permissions.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/pipeline.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/source.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/utils.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/pipeline/write.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/processor.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/airtable.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/base_runner.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/biomed.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/confluence.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/delta_table.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/discord.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/elasticsearch.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/fsspec/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/fsspec/azure.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/fsspec/box.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/fsspec/dropbox.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/fsspec/fsspec.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/fsspec/gcs.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/fsspec/s3.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/fsspec/sftp.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/github.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/gitlab.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/google_drive.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/hubspot.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/jira.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/kafka.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/local.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/mongodb.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/notion.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/onedrive.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/opensearch.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/outlook.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/reddit.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/salesforce.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/sharepoint.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/slack.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/utils.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/wikipedia.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/base_writer.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/chroma.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/clarifai.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/databricks_volumes.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/delta_table.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/elasticsearch.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/fsspec/azure.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/fsspec/box.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/fsspec/gcs.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/fsspec/s3.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/kafka.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/mongodb.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/opensearch.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/pinecone.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/qdrant.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/sql.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/vectara.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/runner/writers/weaviate.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/utils/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/utils/compression.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/utils/data_prep.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/utils/table.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/cli/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/cli/base/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/cli/base/importer.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/cli/cli.py +0 -0
- {unstructured-ingest-0.0.3/unstructured_ingest/v2/cli/cmds/fsspec → unstructured-ingest-0.0.4/unstructured_ingest/v2/cli/utils}/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/example.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/interfaces/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/interfaces/process.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/logger.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/main.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/interfaces.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/fsspec/utils.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/v2/processes/connectors/utils.py +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest.egg-info/dependency_links.txt +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest.egg-info/entry_points.txt +0 -0
- {unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.0.3
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -21,6 +21,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
22
|
Requires-Python: >=3.9.0,<3.13
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
|
+
Provides-Extra: remote
|
|
24
25
|
Provides-Extra: csv
|
|
25
26
|
Provides-Extra: doc
|
|
26
27
|
Provides-Extra: docx
|
|
@@ -37,7 +38,7 @@ Provides-Extra: rst
|
|
|
37
38
|
Provides-Extra: tsv
|
|
38
39
|
Provides-Extra: xlsx
|
|
39
40
|
Provides-Extra: airtable
|
|
40
|
-
Provides-Extra: astra
|
|
41
|
+
Provides-Extra: astradb
|
|
41
42
|
Provides-Extra: azure
|
|
42
43
|
Provides-Extra: azure-cognitive-search
|
|
43
44
|
Provides-Extra: biomed
|
|
@@ -45,6 +46,7 @@ Provides-Extra: box
|
|
|
45
46
|
Provides-Extra: chroma
|
|
46
47
|
Provides-Extra: clarifai
|
|
47
48
|
Provides-Extra: confluence
|
|
49
|
+
Provides-Extra: couchbase
|
|
48
50
|
Provides-Extra: delta-table
|
|
49
51
|
Provides-Extra: discord
|
|
50
52
|
Provides-Extra: dropbox
|
|
@@ -75,6 +77,7 @@ Provides-Extra: wikipedia
|
|
|
75
77
|
Provides-Extra: weaviate
|
|
76
78
|
Provides-Extra: databricks-volumes
|
|
77
79
|
Provides-Extra: singlestore
|
|
80
|
+
Provides-Extra: vectara
|
|
78
81
|
Provides-Extra: embed-huggingface
|
|
79
82
|
Provides-Extra: embed-octoai
|
|
80
83
|
Provides-Extra: embed-vertexai
|
|
@@ -81,7 +81,7 @@ all_doc_reqs = list(
|
|
|
81
81
|
)
|
|
82
82
|
connectors_reqs = {
|
|
83
83
|
"airtable": load_requirements("requirements/connectors/airtable.in"),
|
|
84
|
-
"
|
|
84
|
+
"astradb": load_requirements("requirements/connectors/astradb.in"),
|
|
85
85
|
"azure": load_requirements("requirements/connectors/azure.in"),
|
|
86
86
|
"azure-cognitive-search": load_requirements(
|
|
87
87
|
"requirements/connectors/azure-cognitive-search.in",
|
|
@@ -91,6 +91,7 @@ connectors_reqs = {
|
|
|
91
91
|
"chroma": load_requirements("requirements/connectors/chroma.in"),
|
|
92
92
|
"clarifai": load_requirements("requirements/connectors/clarifai.in"),
|
|
93
93
|
"confluence": load_requirements("requirements/connectors/confluence.in"),
|
|
94
|
+
"couchbase": load_requirements("requirements/connectors/couchbase.in"),
|
|
94
95
|
"delta-table": load_requirements("requirements/connectors/delta-table.in"),
|
|
95
96
|
"discord": load_requirements("requirements/connectors/discord.in"),
|
|
96
97
|
"dropbox": load_requirements("requirements/connectors/dropbox.in"),
|
|
@@ -121,6 +122,7 @@ connectors_reqs = {
|
|
|
121
122
|
"weaviate": load_requirements("requirements/connectors/weaviate.in"),
|
|
122
123
|
"databricks-volumes": load_requirements("requirements/connectors/databricks-volumes.in"),
|
|
123
124
|
"singlestore": load_requirements("requirements/connectors/singlestore.in"),
|
|
125
|
+
"vectara": load_requirements("requirements/connectors/vectara.in"),
|
|
124
126
|
}
|
|
125
127
|
|
|
126
128
|
embed_reqs = {
|
|
@@ -150,7 +152,9 @@ docs_reqs = {
|
|
|
150
152
|
"xlsx": xlsx_reqs,
|
|
151
153
|
}
|
|
152
154
|
|
|
153
|
-
extras_require = {
|
|
155
|
+
extras_require = {
|
|
156
|
+
"remote": load_requirements("requirements/remote/client.in"),
|
|
157
|
+
}
|
|
154
158
|
for d in [docs_reqs, connectors_reqs, embed_reqs]:
|
|
155
159
|
extras_require.update(d)
|
|
156
160
|
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from unstructured.chunking import dispatch
|
|
5
|
+
from unstructured.documents.elements import assign_and_map_hash_ids
|
|
6
|
+
from unstructured.partition.auto import partition
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.utils.chunking import (
|
|
9
|
+
assign_and_map_hash_ids as new_assign_and_map_hash_ids,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
test_file_path = Path(__file__).resolve()
|
|
13
|
+
project_root = test_file_path.parent.parent
|
|
14
|
+
docs_path = project_root / "example-docs"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@pytest.mark.parametrize(
|
|
18
|
+
"chunking_strategy",
|
|
19
|
+
["basic", "by_title"],
|
|
20
|
+
)
|
|
21
|
+
def test_assign_and_map_hash_ids(chunking_strategy):
|
|
22
|
+
# Make sure the new logic working on dict content matches the
|
|
23
|
+
# results if using the unstructured version
|
|
24
|
+
file_path = docs_path / "book-war-and-peace-1p.txt"
|
|
25
|
+
elements = partition(filename=str(file_path.resolve()), strategy="fast")
|
|
26
|
+
chunked_elements = dispatch.chunk(elements=elements, chunking_strategy=chunking_strategy)
|
|
27
|
+
chunked_elements_copy = chunked_elements.copy()
|
|
28
|
+
|
|
29
|
+
hashed_chunked_elements = assign_and_map_hash_ids(chunked_elements)
|
|
30
|
+
og_chunked_elements_dicts = [e.to_dict() for e in hashed_chunked_elements]
|
|
31
|
+
|
|
32
|
+
new_chunked_elements_dicts = [e.to_dict() for e in chunked_elements_copy]
|
|
33
|
+
new_chunked_elements_dicts = new_assign_and_map_hash_ids(new_chunked_elements_dicts)
|
|
34
|
+
|
|
35
|
+
for e1, e2 in zip(og_chunked_elements_dicts, new_chunked_elements_dicts):
|
|
36
|
+
assert e1 == e2
|
|
@@ -90,7 +90,7 @@ class ExampleIngestDoc(BaseSingleIngestDoc):
|
|
|
90
90
|
pass
|
|
91
91
|
|
|
92
92
|
|
|
93
|
-
@pytest.fixture
|
|
93
|
+
@pytest.fixture
|
|
94
94
|
def partition_test_results():
|
|
95
95
|
# Reusable partition test results, calculated only once
|
|
96
96
|
result = partition(
|
|
@@ -107,7 +107,7 @@ def partition_test_results():
|
|
|
107
107
|
return result
|
|
108
108
|
|
|
109
109
|
|
|
110
|
-
@pytest.fixture
|
|
110
|
+
@pytest.fixture
|
|
111
111
|
def partition_file_test_results(partition_test_results):
|
|
112
112
|
# Reusable partition_file test results, calculated only once
|
|
113
113
|
return elements_to_dicts(partition_test_results)
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field, Secret, SecretStr
|
|
5
|
+
from pydantic.types import _SecretBase
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.v2.utils import serialize_base_model, serialize_base_model_json
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class MockChildBaseModel(BaseModel):
|
|
11
|
+
child_secret_str: SecretStr
|
|
12
|
+
child_secret_float: Secret[float]
|
|
13
|
+
child_not_secret_dict: dict[str, Any] = Field(default_factory=dict)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MockBaseModel(BaseModel):
|
|
17
|
+
secret_str: SecretStr
|
|
18
|
+
not_secret_bool: bool
|
|
19
|
+
secret_child_base: Secret[MockChildBaseModel]
|
|
20
|
+
not_secret_list: list[int] = Field(default_factory=list)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
model = MockBaseModel(
|
|
24
|
+
secret_str="secret string",
|
|
25
|
+
not_secret_bool=False,
|
|
26
|
+
secret_child_base=MockChildBaseModel(
|
|
27
|
+
child_secret_str="child secret string",
|
|
28
|
+
child_secret_float=3.14,
|
|
29
|
+
child_not_secret_dict={"key": "value"},
|
|
30
|
+
),
|
|
31
|
+
not_secret_list=[1, 2, 3],
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_serialize_base_model():
|
|
36
|
+
|
|
37
|
+
serialized_dict = model.dict()
|
|
38
|
+
assert isinstance(serialized_dict["secret_str"], _SecretBase)
|
|
39
|
+
assert isinstance(serialized_dict["secret_child_base"], _SecretBase)
|
|
40
|
+
|
|
41
|
+
serialized_dict_w_secrets = serialize_base_model(model=model)
|
|
42
|
+
assert not isinstance(serialized_dict_w_secrets["secret_str"], _SecretBase)
|
|
43
|
+
assert not isinstance(serialized_dict_w_secrets["secret_child_base"], _SecretBase)
|
|
44
|
+
|
|
45
|
+
expected_dict = {
|
|
46
|
+
"secret_str": "secret string",
|
|
47
|
+
"not_secret_bool": False,
|
|
48
|
+
"secret_child_base": {
|
|
49
|
+
"child_secret_str": "child secret string",
|
|
50
|
+
"child_secret_float": 3.14,
|
|
51
|
+
"child_not_secret_dict": {"key": "value"},
|
|
52
|
+
},
|
|
53
|
+
"not_secret_list": [1, 2, 3],
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
assert serialized_dict_w_secrets == expected_dict
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_serialize_base_model_json():
|
|
60
|
+
serialized_json = model.json()
|
|
61
|
+
serialized_dict = json.loads(serialized_json)
|
|
62
|
+
expected_dict = {
|
|
63
|
+
"secret_str": "**********",
|
|
64
|
+
"not_secret_bool": False,
|
|
65
|
+
"secret_child_base": "**********",
|
|
66
|
+
"not_secret_list": [1, 2, 3],
|
|
67
|
+
}
|
|
68
|
+
assert expected_dict == serialized_dict
|
|
69
|
+
|
|
70
|
+
serialized_json_w_secrets = serialize_base_model_json(model=model)
|
|
71
|
+
serialized_dict_w_secrets = json.loads(serialized_json_w_secrets)
|
|
72
|
+
expected_dict_w_secrets = {
|
|
73
|
+
"secret_str": "secret string",
|
|
74
|
+
"not_secret_bool": False,
|
|
75
|
+
"secret_child_base": {
|
|
76
|
+
"child_secret_str": "child secret string",
|
|
77
|
+
"child_secret_float": 3.14,
|
|
78
|
+
"child_not_secret_dict": {"key": "value"},
|
|
79
|
+
},
|
|
80
|
+
"not_secret_list": [1, 2, 3],
|
|
81
|
+
}
|
|
82
|
+
assert expected_dict_w_secrets == serialized_dict_w_secrets
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.0.4" # pragma: no cover
|
|
@@ -1,16 +1,21 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
2
|
+
|
|
1
3
|
import click
|
|
2
4
|
|
|
3
5
|
from unstructured_ingest.cli import dest, src
|
|
4
6
|
from unstructured_ingest.v2.cli.cmds import dest as dest_v2
|
|
5
7
|
from unstructured_ingest.v2.cli.cmds import src as src_v2
|
|
6
8
|
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from click import Command
|
|
11
|
+
|
|
7
12
|
|
|
8
13
|
@click.group()
|
|
9
14
|
def ingest():
|
|
10
15
|
pass
|
|
11
16
|
|
|
12
17
|
|
|
13
|
-
def get_cmd() ->
|
|
18
|
+
def get_cmd() -> "Command":
|
|
14
19
|
"""Construct and return a Click command object representing the main command for the CLI.
|
|
15
20
|
|
|
16
21
|
This function adds all dest_subcommand(s) to each src_subcommand, and adds all of those
|
{unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/cmds/__init__.py
RENAMED
|
@@ -7,8 +7,8 @@ from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
|
7
7
|
from unstructured_ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_base_src_cmd
|
|
8
8
|
|
|
9
9
|
from .airtable import get_base_src_cmd as airtable_base_src_cmd
|
|
10
|
-
from .
|
|
11
|
-
from .
|
|
10
|
+
from .astradb import get_base_dest_cmd as astradb_base_dest_cmd
|
|
11
|
+
from .astradb import get_base_src_cmd as astradb_base_src_cmd
|
|
12
12
|
from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd
|
|
13
13
|
from .biomed import get_base_src_cmd as biomed_base_src_cmd
|
|
14
14
|
from .chroma import get_base_dest_cmd as chroma_base_dest_cmd
|
|
@@ -63,7 +63,7 @@ if t.TYPE_CHECKING:
|
|
|
63
63
|
|
|
64
64
|
base_src_cmd_fns: t.List[t.Callable[[], BaseSrcCmd]] = [
|
|
65
65
|
airtable_base_src_cmd,
|
|
66
|
-
|
|
66
|
+
astradb_base_src_cmd,
|
|
67
67
|
azure_base_src_cmd,
|
|
68
68
|
biomed_base_src_cmd,
|
|
69
69
|
box_base_src_cmd,
|
|
@@ -106,7 +106,7 @@ if src_duplicates:
|
|
|
106
106
|
)
|
|
107
107
|
|
|
108
108
|
base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
|
|
109
|
-
|
|
109
|
+
astradb_base_dest_cmd,
|
|
110
110
|
azure_base_dest_cmd,
|
|
111
111
|
box_base_dest_cmd,
|
|
112
112
|
chroma_base_dest_cmd,
|
|
@@ -4,11 +4,11 @@ from dataclasses import dataclass
|
|
|
4
4
|
import click
|
|
5
5
|
|
|
6
6
|
from unstructured_ingest.cli.interfaces import CliConfig, Dict
|
|
7
|
-
from unstructured_ingest.connector.
|
|
7
|
+
from unstructured_ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
@dataclass
|
|
11
|
-
class AstraCliConfig(SimpleAstraConfig, CliConfig):
|
|
11
|
+
class AstraDBCliConfig(SimpleAstraDBConfig, CliConfig):
|
|
12
12
|
@staticmethod
|
|
13
13
|
def get_cli_options() -> t.List[click.Option]:
|
|
14
14
|
options = [
|
|
@@ -48,7 +48,7 @@ class AstraCliConfig(SimpleAstraConfig, CliConfig):
|
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
@dataclass
|
|
51
|
-
class
|
|
51
|
+
class AstraDBCliWriteConfig(AstraDBWriteConfig, CliConfig):
|
|
52
52
|
@staticmethod
|
|
53
53
|
def get_cli_options() -> t.List[click.Option]:
|
|
54
54
|
options = [
|
|
@@ -81,8 +81,8 @@ def get_base_src_cmd():
|
|
|
81
81
|
from unstructured_ingest.cli.base.src import BaseSrcCmd
|
|
82
82
|
|
|
83
83
|
cmd_cls = BaseSrcCmd(
|
|
84
|
-
cmd_name="
|
|
85
|
-
cli_config=
|
|
84
|
+
cmd_name="astradb",
|
|
85
|
+
cli_config=AstraDBCliConfig,
|
|
86
86
|
)
|
|
87
87
|
return cmd_cls
|
|
88
88
|
|
|
@@ -91,9 +91,9 @@ def get_base_dest_cmd():
|
|
|
91
91
|
from unstructured_ingest.cli.base.dest import BaseDestCmd
|
|
92
92
|
|
|
93
93
|
cmd_cls = BaseDestCmd(
|
|
94
|
-
cmd_name="
|
|
95
|
-
cli_config=
|
|
96
|
-
additional_cli_options=[
|
|
97
|
-
write_config=
|
|
94
|
+
cmd_name="astradb",
|
|
95
|
+
cli_config=AstraDBCliConfig,
|
|
96
|
+
additional_cli_options=[AstraDBCliWriteConfig],
|
|
97
|
+
write_config=AstraDBWriteConfig,
|
|
98
98
|
)
|
|
99
99
|
return cmd_cls
|
{unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/cli/interfaces.py
RENAMED
|
@@ -11,7 +11,6 @@ from pathlib import Path
|
|
|
11
11
|
import click
|
|
12
12
|
from dataclasses_json.core import Json
|
|
13
13
|
from typing_extensions import Self
|
|
14
|
-
from unstructured.chunking import CHUNK_MAX_CHARS_DEFAULT, CHUNK_MULTI_PAGE_DEFAULT
|
|
15
14
|
|
|
16
15
|
from unstructured_ingest.interfaces import (
|
|
17
16
|
BaseConfig,
|
|
@@ -25,6 +24,9 @@ from unstructured_ingest.interfaces import (
|
|
|
25
24
|
RetryStrategyConfig,
|
|
26
25
|
)
|
|
27
26
|
|
|
27
|
+
CHUNK_MAX_CHARS_DEFAULT: int = 500
|
|
28
|
+
CHUNK_MULTI_PAGE_DEFAULT: bool = True
|
|
29
|
+
|
|
28
30
|
|
|
29
31
|
class Dict(click.ParamType):
|
|
30
32
|
name = "dict"
|
|
@@ -412,14 +414,19 @@ class CliFilesStorageConfig(FileStorageConfig, CliMixin):
|
|
|
412
414
|
class CliEmbeddingConfig(EmbeddingConfig, CliMixin):
|
|
413
415
|
@staticmethod
|
|
414
416
|
def get_cli_options() -> t.List[click.Option]:
|
|
415
|
-
|
|
416
|
-
|
|
417
|
+
embed_providers = [
|
|
418
|
+
"langchain-openai",
|
|
419
|
+
"langchain-huggingface",
|
|
420
|
+
"langchain-aws-bedrock",
|
|
421
|
+
"langchain-vertexai",
|
|
422
|
+
"langchain-voyageai",
|
|
423
|
+
"octoai",
|
|
424
|
+
]
|
|
417
425
|
options = [
|
|
418
426
|
click.Option(
|
|
419
427
|
["--embedding-provider"],
|
|
420
|
-
help="Type of the embedding class to be used.
|
|
421
|
-
|
|
422
|
-
type=click.Choice(list(EMBEDDING_PROVIDER_TO_CLASS_MAP)),
|
|
428
|
+
help="Type of the embedding class to be used.",
|
|
429
|
+
type=click.Choice(embed_providers),
|
|
423
430
|
),
|
|
424
431
|
click.Option(
|
|
425
432
|
["--embedding-api-key"],
|
|
@@ -30,23 +30,23 @@ NON_INDEXED_FIELDS = ["metadata._node_content", "content"]
|
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
@dataclass
|
|
33
|
-
class
|
|
33
|
+
class AstraDBAccessConfig(AccessConfig):
|
|
34
34
|
token: str = enhanced_field(sensitive=True)
|
|
35
35
|
api_endpoint: str = enhanced_field(sensitive=True)
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
@dataclass
|
|
39
|
-
class SimpleAstraConfig(BaseConnectorConfig):
|
|
40
|
-
access_config:
|
|
39
|
+
class SimpleAstraDBConfig(BaseConnectorConfig):
|
|
40
|
+
access_config: AstraDBAccessConfig
|
|
41
41
|
collection_name: str
|
|
42
42
|
namespace: t.Optional[str] = None
|
|
43
43
|
|
|
44
44
|
|
|
45
45
|
@dataclass
|
|
46
|
-
class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
|
|
47
|
-
connector_config:
|
|
46
|
+
class AstraDBIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
|
|
47
|
+
connector_config: SimpleAstraDBConfig
|
|
48
48
|
metadata: t.Dict[str, str] = field(default_factory=dict)
|
|
49
|
-
registry_name: str = "
|
|
49
|
+
registry_name: str = "astradb"
|
|
50
50
|
|
|
51
51
|
@property
|
|
52
52
|
def filename(self):
|
|
@@ -75,7 +75,7 @@ class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
|
|
|
75
75
|
)
|
|
76
76
|
|
|
77
77
|
@SourceConnectionError.wrap
|
|
78
|
-
@requires_dependencies(["astrapy"], extras="
|
|
78
|
+
@requires_dependencies(["astrapy"], extras="astradb")
|
|
79
79
|
@BaseSingleIngestDoc.skip_if_file_exists
|
|
80
80
|
def get_file(self):
|
|
81
81
|
self.filename.parent.mkdir(parents=True, exist_ok=True)
|
|
@@ -89,19 +89,19 @@ class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
|
|
|
89
89
|
|
|
90
90
|
|
|
91
91
|
@dataclass
|
|
92
|
-
class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
|
93
|
-
connector_config:
|
|
92
|
+
class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
|
93
|
+
connector_config: SimpleAstraDBConfig
|
|
94
94
|
_astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
|
|
95
95
|
_astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)
|
|
96
96
|
|
|
97
97
|
@property
|
|
98
|
-
@requires_dependencies(["astrapy"], extras="
|
|
98
|
+
@requires_dependencies(["astrapy"], extras="astradb")
|
|
99
99
|
def astra_db_collection(self) -> "AstraDBCollection":
|
|
100
100
|
if self._astra_db_collection is None:
|
|
101
101
|
from astrapy.db import AstraDB
|
|
102
102
|
|
|
103
103
|
# Build the Astra DB object.
|
|
104
|
-
# caller_name/version for
|
|
104
|
+
# caller_name/version for Astra DB tracking
|
|
105
105
|
self._astra_db = AstraDB(
|
|
106
106
|
api_endpoint=self.connector_config.access_config.api_endpoint,
|
|
107
107
|
token=self.connector_config.access_config.token,
|
|
@@ -116,12 +116,12 @@ class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
|
|
116
116
|
)
|
|
117
117
|
return self._astra_db_collection # type: ignore
|
|
118
118
|
|
|
119
|
-
@requires_dependencies(["astrapy"], extras="
|
|
119
|
+
@requires_dependencies(["astrapy"], extras="astradb")
|
|
120
120
|
@SourceConnectionError.wrap # type: ignore
|
|
121
121
|
def initialize(self):
|
|
122
122
|
_ = self.astra_db_collection
|
|
123
123
|
|
|
124
|
-
@requires_dependencies(["astrapy"], extras="
|
|
124
|
+
@requires_dependencies(["astrapy"], extras="astradb")
|
|
125
125
|
def check_connection(self):
|
|
126
126
|
try:
|
|
127
127
|
_ = self.astra_db_collection
|
|
@@ -129,14 +129,14 @@ class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
|
|
129
129
|
logger.error(f"Failed to validate connection {e}", exc_info=True)
|
|
130
130
|
raise SourceConnectionError(f"failed to validate connection: {e}")
|
|
131
131
|
|
|
132
|
-
@requires_dependencies(["astrapy"], extras="
|
|
132
|
+
@requires_dependencies(["astrapy"], extras="astradb")
|
|
133
133
|
def get_ingest_docs(self): # type: ignore
|
|
134
134
|
# Perform the find operation
|
|
135
|
-
|
|
135
|
+
astra_db_docs = list(self.astra_db_collection.paginated_find())
|
|
136
136
|
|
|
137
137
|
doc_list = []
|
|
138
|
-
for record in
|
|
139
|
-
doc =
|
|
138
|
+
for record in astra_db_docs:
|
|
139
|
+
doc = AstraDBIngestDoc(
|
|
140
140
|
connector_config=self.connector_config,
|
|
141
141
|
processor_config=self.processor_config,
|
|
142
142
|
read_config=self.read_config,
|
|
@@ -151,16 +151,16 @@ class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
|
|
151
151
|
|
|
152
152
|
|
|
153
153
|
@dataclass
|
|
154
|
-
class
|
|
154
|
+
class AstraDBWriteConfig(WriteConfig):
|
|
155
155
|
embedding_dimension: int
|
|
156
156
|
requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None
|
|
157
157
|
batch_size: int = 20
|
|
158
158
|
|
|
159
159
|
|
|
160
160
|
@dataclass
|
|
161
|
-
class AstraDestinationConnector(BaseDestinationConnector):
|
|
162
|
-
write_config:
|
|
163
|
-
connector_config:
|
|
161
|
+
class AstraDBDestinationConnector(BaseDestinationConnector):
|
|
162
|
+
write_config: AstraDBWriteConfig
|
|
163
|
+
connector_config: SimpleAstraDBConfig
|
|
164
164
|
_astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
|
|
165
165
|
_astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)
|
|
166
166
|
|
|
@@ -179,7 +179,7 @@ class AstraDestinationConnector(BaseDestinationConnector):
|
|
|
179
179
|
return _asdict(self_cp, **kwargs)
|
|
180
180
|
|
|
181
181
|
@property
|
|
182
|
-
@requires_dependencies(["astrapy"], extras="
|
|
182
|
+
@requires_dependencies(["astrapy"], extras="astradb")
|
|
183
183
|
def astra_db_collection(self) -> "AstraDBCollection":
|
|
184
184
|
if self._astra_db_collection is None:
|
|
185
185
|
from astrapy.db import AstraDB
|
|
@@ -187,11 +187,11 @@ class AstraDestinationConnector(BaseDestinationConnector):
|
|
|
187
187
|
collection_name = self.connector_config.collection_name
|
|
188
188
|
embedding_dimension = self.write_config.embedding_dimension
|
|
189
189
|
|
|
190
|
-
# If the user has requested an indexing policy, pass it to the
|
|
190
|
+
# If the user has requested an indexing policy, pass it to the Astra DB
|
|
191
191
|
requested_indexing_policy = self.write_config.requested_indexing_policy
|
|
192
192
|
options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
|
|
193
193
|
|
|
194
|
-
# caller_name/version for
|
|
194
|
+
# caller_name/version for Astra DB tracking
|
|
195
195
|
self._astra_db = AstraDB(
|
|
196
196
|
api_endpoint=self.connector_config.access_config.api_endpoint,
|
|
197
197
|
token=self.connector_config.access_config.token,
|
|
@@ -208,12 +208,12 @@ class AstraDestinationConnector(BaseDestinationConnector):
|
|
|
208
208
|
)
|
|
209
209
|
return self._astra_db_collection
|
|
210
210
|
|
|
211
|
-
@requires_dependencies(["astrapy"], extras="
|
|
211
|
+
@requires_dependencies(["astrapy"], extras="astradb")
|
|
212
212
|
@DestinationConnectionError.wrap
|
|
213
213
|
def initialize(self):
|
|
214
214
|
_ = self.astra_db_collection
|
|
215
215
|
|
|
216
|
-
@requires_dependencies(["astrapy"], extras="
|
|
216
|
+
@requires_dependencies(["astrapy"], extras="astradb")
|
|
217
217
|
def check_connection(self):
|
|
218
218
|
try:
|
|
219
219
|
_ = self.astra_db_collection
|
|
@@ -222,11 +222,11 @@ class AstraDestinationConnector(BaseDestinationConnector):
|
|
|
222
222
|
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
|
223
223
|
|
|
224
224
|
def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
|
|
225
|
-
logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra.")
|
|
225
|
+
logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra DB.")
|
|
226
226
|
|
|
227
|
-
astra_batch_size = self.write_config.batch_size
|
|
227
|
+
astra_db_batch_size = self.write_config.batch_size
|
|
228
228
|
|
|
229
|
-
for batch in batch_generator(elements_dict, astra_batch_size):
|
|
229
|
+
for batch in batch_generator(elements_dict, astra_db_batch_size):
|
|
230
230
|
self._astra_db_collection.insert_many(batch)
|
|
231
231
|
|
|
232
232
|
def normalize_dict(self, element_dict: dict) -> dict:
|
{unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/biomed.py
RENAMED
|
@@ -5,9 +5,6 @@ from dataclasses import dataclass
|
|
|
5
5
|
from ftplib import FTP, error_perm
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
-
import requests
|
|
9
|
-
from requests.adapters import HTTPAdapter
|
|
10
|
-
|
|
11
8
|
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
12
9
|
from unstructured_ingest.interfaces import (
|
|
13
10
|
BaseConnectorConfig,
|
|
@@ -20,6 +17,10 @@ from unstructured_ingest.logger import logger
|
|
|
20
17
|
from unstructured_ingest.utils.data_prep import (
|
|
21
18
|
validate_date_args,
|
|
22
19
|
)
|
|
20
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
21
|
+
|
|
22
|
+
if t.TYPE_CHECKING:
|
|
23
|
+
from requests import Response, Session
|
|
23
24
|
|
|
24
25
|
DOMAIN = "ftp.ncbi.nlm.nih.gov"
|
|
25
26
|
FTP_DOMAIN = f"ftp://{DOMAIN}"
|
|
@@ -165,8 +166,11 @@ class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
|
|
165
166
|
|
|
166
167
|
return endpoint_url
|
|
167
168
|
|
|
169
|
+
@requires_dependencies(["requests"], extras="biomed")
|
|
168
170
|
def _list_objects_api(self) -> t.List[BiomedFileMeta]:
|
|
169
171
|
from bs4 import BeautifulSoup
|
|
172
|
+
from requests import Session
|
|
173
|
+
from requests.adapters import HTTPAdapter
|
|
170
174
|
|
|
171
175
|
def urls_to_metadata(urls):
|
|
172
176
|
files = []
|
|
@@ -193,7 +197,7 @@ class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
|
|
193
197
|
endpoint_url = self.get_base_endpoints_url()
|
|
194
198
|
|
|
195
199
|
while endpoint_url:
|
|
196
|
-
session = requests.Session()
|
|
200
|
+
session = Session()
|
|
197
201
|
adapter = HTTPAdapter()
|
|
198
202
|
session.mount("http://", adapter)
|
|
199
203
|
session.mount("https://", adapter)
|
|
@@ -213,7 +217,7 @@ class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
|
|
213
217
|
return files
|
|
214
218
|
|
|
215
219
|
@SourceConnectionNetworkError.wrap
|
|
216
|
-
def _get_request(self, session: requests.Session, endpoint_url: str) -> requests.Response:
|
|
220
|
+
def _get_request(self, session: "Session", endpoint_url: str) -> "Response":
|
|
217
221
|
return session.get(endpoint_url, timeout=self.connector_config.max_request_time)
|
|
218
222
|
|
|
219
223
|
def _list_objects(self) -> t.List[BiomedFileMeta]:
|
|
@@ -293,7 +297,10 @@ class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
|
|
293
297
|
def initialize(self):
|
|
294
298
|
pass
|
|
295
299
|
|
|
300
|
+
@requires_dependencies(["requests"], extras="biomed")
|
|
296
301
|
def check_connection(self):
|
|
302
|
+
import requests
|
|
303
|
+
|
|
297
304
|
resp = requests.head(self.get_base_endpoints_url())
|
|
298
305
|
try:
|
|
299
306
|
resp.raise_for_status()
|
{unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/confluence.py
RENAMED
|
@@ -4,8 +4,6 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from datetime import datetime
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
|
|
7
|
-
import requests
|
|
8
|
-
|
|
9
7
|
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
10
8
|
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
11
9
|
from unstructured_ingest.interfaces import (
|
|
@@ -208,8 +206,10 @@ class ConfluenceSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
|
|
|
208
206
|
)
|
|
209
207
|
return self._confluence
|
|
210
208
|
|
|
211
|
-
@requires_dependencies(["atlassian"], extras="Confluence")
|
|
209
|
+
@requires_dependencies(["atlassian", "requests"], extras="Confluence")
|
|
212
210
|
def check_connection(self):
|
|
211
|
+
import requests
|
|
212
|
+
|
|
213
213
|
url = "rest/api/space"
|
|
214
214
|
try:
|
|
215
215
|
self.confluence.request(method="HEAD", path=url)
|
{unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/github.py
RENAMED
|
@@ -3,8 +3,6 @@ from dataclasses import dataclass
|
|
|
3
3
|
from datetime import datetime
|
|
4
4
|
from urllib.parse import urlparse
|
|
5
5
|
|
|
6
|
-
import requests
|
|
7
|
-
|
|
8
6
|
from unstructured_ingest.connector.git import (
|
|
9
7
|
GitIngestDoc,
|
|
10
8
|
GitSourceConnector,
|
|
@@ -71,7 +69,10 @@ class GitHubIngestDoc(GitIngestDoc):
|
|
|
71
69
|
return content_file
|
|
72
70
|
|
|
73
71
|
@SourceConnectionNetworkError.wrap
|
|
72
|
+
@requires_dependencies(["requests"], extras="github")
|
|
74
73
|
def _fetch_content(self, content_file):
|
|
74
|
+
import requests
|
|
75
|
+
|
|
75
76
|
contents = b""
|
|
76
77
|
if (
|
|
77
78
|
not content_file.content # type: ignore
|
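{unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/google_drive.py
RENAMED
|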
@@ -7,8 +7,6 @@ from datetime import datetime
|
|
|
7
7
|
from mimetypes import guess_extension
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
|
|
10
|
-
from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
|
|
11
|
-
|
|
12
10
|
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
13
11
|
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
14
12
|
from unstructured_ingest.interfaces import (
|
|
@@ -25,6 +23,7 @@ from unstructured_ingest.interfaces import (
|
|
|
25
23
|
)
|
|
26
24
|
from unstructured_ingest.logger import logger
|
|
27
25
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
26
|
+
from unstructured_ingest.utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
|
|
28
27
|
from unstructured_ingest.utils.string_and_date_utils import json_to_dict
|
|
29
28
|
|
|
30
29
|
if t.TYPE_CHECKING:
|
{unstructured-ingest-0.0.3 → unstructured-ingest-0.0.4}/unstructured_ingest/connector/mongodb.py
RENAMED
|
@@ -3,8 +3,7 @@ import typing as t
|
|
|
3
3
|
from dataclasses import dataclass, field
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
|
|
6
|
-
from unstructured.__version__ import __version__ as unstructured_version
|
|
7
|
-
|
|
6
|
+
from unstructured_ingest.__version__ import __version__ as unstructured_version
|
|
8
7
|
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
9
8
|
from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
10
9
|
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError, WriteError
|