vector-inspector 0.2.7__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vector_inspector/config/__init__.py +4 -0
- vector_inspector/config/known_embedding_models.json +432 -0
- vector_inspector/core/connections/__init__.py +2 -1
- vector_inspector/core/connections/base_connection.py +42 -1
- vector_inspector/core/connections/chroma_connection.py +47 -11
- vector_inspector/core/connections/pinecone_connection.py +768 -0
- vector_inspector/core/embedding_providers/__init__.py +14 -0
- vector_inspector/core/embedding_providers/base_provider.py +128 -0
- vector_inspector/core/embedding_providers/clip_provider.py +260 -0
- vector_inspector/core/embedding_providers/provider_factory.py +176 -0
- vector_inspector/core/embedding_providers/sentence_transformer_provider.py +203 -0
- vector_inspector/core/embedding_utils.py +69 -42
- vector_inspector/core/model_registry.py +205 -0
- vector_inspector/services/backup_restore_service.py +16 -0
- vector_inspector/services/settings_service.py +117 -1
- vector_inspector/ui/components/connection_manager_panel.py +7 -0
- vector_inspector/ui/components/profile_manager_panel.py +61 -14
- vector_inspector/ui/dialogs/__init__.py +2 -1
- vector_inspector/ui/dialogs/cross_db_migration.py +20 -1
- vector_inspector/ui/dialogs/embedding_config_dialog.py +166 -27
- vector_inspector/ui/dialogs/provider_type_dialog.py +189 -0
- vector_inspector/ui/main_window.py +33 -2
- vector_inspector/ui/views/connection_view.py +55 -10
- vector_inspector/ui/views/info_panel.py +83 -36
- vector_inspector/ui/views/search_view.py +1 -1
- vector_inspector/ui/views/visualization_view.py +19 -5
- {vector_inspector-0.2.7.dist-info → vector_inspector-0.3.1.dist-info}/METADATA +1 -1
- vector_inspector-0.3.1.dist-info/RECORD +55 -0
- vector_inspector-0.2.7.dist-info/RECORD +0 -45
- {vector_inspector-0.2.7.dist-info → vector_inspector-0.3.1.dist-info}/WHEEL +0 -0
- {vector_inspector-0.2.7.dist-info → vector_inspector-0.3.1.dist-info}/entry_points.txt +0 -0
vector_inspector/config/known_embedding_models.json
@@ -0,0 +1,432 @@
+{
+  "models": [
+    {
+      "name": "all-MiniLM-L6-v2",
+      "type": "sentence-transformer",
+      "dimension": 384,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Fast, small-footprint text embeddings (good default for text search)"
+    },
+    {
+      "name": "openai/clip-vit-base-patch32",
+      "type": "clip",
+      "dimension": 512,
+      "modality": "multimodal",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Standard CLIP ViT-B/32 model; supports matching text ↔ images"
+    },
+    {
+      "name": "paraphrase-albert-small-v2",
+      "type": "sentence-transformer",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Smaller paraphrase-specialized model"
+    },
+    {
+      "name": "all-mpnet-base-v2",
+      "type": "sentence-transformer",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "High-quality text embeddings; recommended for semantic tasks"
+    },
+    {
+      "name": "all-roberta-large-v1",
+      "type": "sentence-transformer",
+      "dimension": 1024,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Large model — high quality, larger memory and compute"
+    },
+    {
+      "name": "gtr-t5-large",
+      "type": "sentence-transformer",
+      "dimension": 1536,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Very large embeddings useful for specialized high-recall tasks"
+    },
+    {
+      "name": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
+      "type": "sentence-transformer",
+      "dimension": 384,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Optimized for semantic search and question-answering tasks"
+    },
+    {
+      "name": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+      "type": "sentence-transformer",
+      "dimension": 384,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Multilingual support for 50+ languages"
+    },
+    {
+      "name": "sentence-transformers/msmarco-distilbert-base-v4",
+      "type": "sentence-transformer",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Trained on MS MARCO dataset, good for passage retrieval"
+    },
+    {
+      "name": "sentence-transformers/all-distilroberta-v1",
+      "type": "sentence-transformer",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Distilled RoBERTa model, balance of speed and quality"
+    },
+    {
+      "name": "sentence-transformers/paraphrase-mpnet-base-v2",
+      "type": "sentence-transformer",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "High-quality paraphrase detection and semantic similarity"
+    },
+    {
+      "name": "BAAI/bge-small-en-v1.5",
+      "type": "sentence-transformer",
+      "dimension": 384,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Beijing Academy of AI model, strong performance for size"
+    },
+    {
+      "name": "BAAI/bge-base-en-v1.5",
+      "type": "sentence-transformer",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "High-quality English embeddings, MTEB benchmark leader"
+    },
+    {
+      "name": "BAAI/bge-large-en-v1.5",
+      "type": "sentence-transformer",
+      "dimension": 1024,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Large model with excellent retrieval performance"
+    },
+    {
+      "name": "thenlper/gte-small",
+      "type": "sentence-transformer",
+      "dimension": 384,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "General Text Embeddings (GTE) small variant"
+    },
+    {
+      "name": "thenlper/gte-base",
+      "type": "sentence-transformer",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "General Text Embeddings (GTE) base model"
+    },
+    {
+      "name": "thenlper/gte-large",
+      "type": "sentence-transformer",
+      "dimension": 1024,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "General Text Embeddings (GTE) large variant"
+    },
+    {
+      "name": "intfloat/e5-small-v2",
+      "type": "sentence-transformer",
+      "dimension": 384,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "E5 family small model, prefix with 'query: ' or 'passage: '"
+    },
+    {
+      "name": "intfloat/e5-base-v2",
+      "type": "sentence-transformer",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "E5 family base model, strong asymmetric retrieval"
+    },
+    {
+      "name": "intfloat/e5-large-v2",
+      "type": "sentence-transformer",
+      "dimension": 1024,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "E5 family large model, top MTEB performance"
+    },
+    {
+      "name": "intfloat/multilingual-e5-small",
+      "type": "sentence-transformer",
+      "dimension": 384,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Multilingual E5 model supporting 100+ languages"
+    },
+    {
+      "name": "intfloat/multilingual-e5-base",
+      "type": "sentence-transformer",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Multilingual E5 base model, excellent cross-lingual retrieval"
+    },
+    {
+      "name": "intfloat/multilingual-e5-large",
+      "type": "sentence-transformer",
+      "dimension": 1024,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Multilingual E5 large model, best-in-class multilingual embeddings"
+    },
+    {
+      "name": "openai/clip-vit-large-patch14",
+      "type": "clip",
+      "dimension": 768,
+      "modality": "multimodal",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Larger CLIP ViT-L/14 model, better quality than base"
+    },
+    {
+      "name": "openai/clip-vit-large-patch14-336",
+      "type": "clip",
+      "dimension": 768,
+      "modality": "multimodal",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Higher resolution (336x336) variant of ViT-L/14"
+    },
+    {
+      "name": "laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
+      "type": "clip",
+      "dimension": 512,
+      "modality": "multimodal",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "LAION's CLIP trained on 2B image-text pairs"
+    },
+    {
+      "name": "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+      "type": "clip",
+      "dimension": 1024,
+      "modality": "multimodal",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "LAION's huge CLIP model, excellent quality"
+    },
+    {
+      "name": "text-embedding-ada-002",
+      "type": "openai",
+      "dimension": 1536,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "openai-api",
+      "description": "OpenAI's production embedding model (legacy). Requires API key."
+    },
+    {
+      "name": "text-embedding-3-small",
+      "type": "openai",
+      "dimension": 1536,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "openai-api",
+      "description": "OpenAI's newer small model, better than ada-002. Requires API key."
+    },
+    {
+      "name": "text-embedding-3-large",
+      "type": "openai",
+      "dimension": 3072,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "openai-api",
+      "description": "OpenAI's large embedding model, highest quality. Requires API key."
+    },
+    {
+      "name": "embed-english-v3.0",
+      "type": "cohere",
+      "dimension": 1024,
+      "modality": "text",
+      "normalization": "none",
+      "source": "cohere-api",
+      "description": "Cohere's English embedding model. Requires API key."
+    },
+    {
+      "name": "embed-english-light-v3.0",
+      "type": "cohere",
+      "dimension": 384,
+      "modality": "text",
+      "normalization": "none",
+      "source": "cohere-api",
+      "description": "Cohere's lightweight English model. Requires API key."
+    },
+    {
+      "name": "embed-multilingual-v3.0",
+      "type": "cohere",
+      "dimension": 1024,
+      "modality": "text",
+      "normalization": "none",
+      "source": "cohere-api",
+      "description": "Cohere's multilingual model supporting 100+ languages. Requires API key."
+    },
+    {
+      "name": "embed-multilingual-light-v3.0",
+      "type": "cohere",
+      "dimension": 384,
+      "modality": "text",
+      "normalization": "none",
+      "source": "cohere-api",
+      "description": "Cohere's lightweight multilingual model. Requires API key."
+    },
+    {
+      "name": "textembedding-gecko@003",
+      "type": "vertex-ai",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "google-cloud",
+      "description": "Google's Gecko model for text embeddings. Requires Google Cloud credentials."
+    },
+    {
+      "name": "text-embedding-004",
+      "type": "vertex-ai",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "google-cloud",
+      "description": "Google's latest text embedding model. Requires Google Cloud credentials."
+    },
+    {
+      "name": "text-multilingual-embedding-002",
+      "type": "vertex-ai",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "google-cloud",
+      "description": "Google's multilingual embedding model. Requires Google Cloud credentials."
+    },
+    {
+      "name": "multimodalembedding@001",
+      "type": "vertex-ai",
+      "dimension": 1408,
+      "modality": "multimodal",
+      "normalization": "l2",
+      "source": "google-cloud",
+      "description": "Google's multimodal embedding model. Requires Google Cloud credentials."
+    },
+    {
+      "name": "voyage-large-2",
+      "type": "voyage",
+      "dimension": 1536,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "voyage-api",
+      "description": "Voyage AI's large model. Requires API key."
+    },
+    {
+      "name": "voyage-code-2",
+      "type": "voyage",
+      "dimension": 1536,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "voyage-api",
+      "description": "Voyage AI's code-optimized model. Requires API key."
+    },
+    {
+      "name": "voyage-2",
+      "type": "voyage",
+      "dimension": 1024,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "voyage-api",
+      "description": "Voyage AI's general-purpose model. Requires API key."
+    },
+    {
+      "name": "jinaai/jina-embeddings-v2-base-en",
+      "type": "sentence-transformer",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Jina AI's 8k context length model, good for long documents"
+    },
+    {
+      "name": "jinaai/jina-embeddings-v2-small-en",
+      "type": "sentence-transformer",
+      "dimension": 512,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Jina AI's small model with 8k context length"
+    },
+    {
+      "name": "nomic-ai/nomic-embed-text-v1",
+      "type": "sentence-transformer",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Nomic's open-source text embedding model with 8k context"
+    },
+    {
+      "name": "nomic-ai/nomic-embed-text-v1.5",
+      "type": "sentence-transformer",
+      "dimension": 768,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Nomic's improved model with better performance"
+    },
+    {
+      "name": "Alibaba-NLP/gte-Qwen2-7B-instruct",
+      "type": "sentence-transformer",
+      "dimension": 3584,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "Very large instruction-following embedding model, SOTA on many benchmarks"
+    },
+    {
+      "name": "nvidia/NV-Embed-v1",
+      "type": "sentence-transformer",
+      "dimension": 4096,
+      "modality": "text",
+      "normalization": "l2",
+      "source": "huggingface",
+      "description": "NVIDIA's embedding model, excellent for retrieval tasks"
+    }
+  ],
+  "metadata": {
+    "version": "1.0.0",
+    "last_updated": "2026-01-24",
+    "description": "Known embedding models registry for Vector Inspector"
+  }
+}
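For context, here is a minimal sketch (not part of the diff) of how a bundled registry like the file above could be read and queried at runtime. The helper names are illustrative assumptions and are not taken from the new vector_inspector/core/model_registry.py, whose contents are not shown here.

    import json
    from importlib import resources

    def load_known_models() -> dict:
        # Read the registry bundled inside the wheel (path taken from the file list above).
        text = resources.files("vector_inspector.config").joinpath(
            "known_embedding_models.json"
        ).read_text(encoding="utf-8")
        return json.loads(text)

    def find_model(registry: dict, name: str) -> dict | None:
        # Return the entry whose "name" field matches, or None.
        return next((m for m in registry["models"] if m["name"] == name), None)

    registry = load_known_models()
    entry = find_model(registry, "all-MiniLM-L6-v2")
    if entry:
        print(entry["dimension"], entry["modality"])  # 384 text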
vector_inspector/core/connections/__init__.py
@@ -3,5 +3,6 @@
 from .base_connection import VectorDBConnection
 from .chroma_connection import ChromaDBConnection
 from .qdrant_connection import QdrantConnection
+from .pinecone_connection import PineconeConnection

-__all__ = ["VectorDBConnection", "ChromaDBConnection", "QdrantConnection"]
+__all__ = ["VectorDBConnection", "ChromaDBConnection", "QdrantConnection", "PineconeConnection"]
vector_inspector/core/connections/base_connection.py
@@ -229,5 +229,46 @@ class VectorDBConnection(ABC):
             {"name": "in", "server_side": True},
             {"name": "not in", "server_side": True},
             {"name": "contains", "server_side": False},
-            {"name": "not contains", "server_side": False},
         ]
+
+    def get_embedding_model(self, collection_name: str, connection_id: Optional[str] = None) -> Optional[str]:
+        """
+        Get the embedding model used for a collection.
+
+        Retrieves the model name from:
+        1. Collection-level metadata (if supported)
+        2. Vector metadata (_embedding_model field)
+        3. User settings (for collections we can't modify)
+
+        Args:
+            collection_name: Name of collection
+            connection_id: Optional connection ID for settings lookup
+
+        Returns:
+            Model name string (e.g., "sentence-transformers/all-MiniLM-L6-v2") or None
+        """
+        try:
+            # First try to get from collection-level metadata
+            info = self.get_collection_info(collection_name)
+            if info and info.get("embedding_model"):
+                return info["embedding_model"]
+
+            # Fall back to checking a sample vector's metadata
+            data = self.get_all_items(collection_name, limit=1, offset=0)
+            if data and data.get("metadatas") and len(data["metadatas"]) > 0:
+                metadata = data["metadatas"][0]
+                if "_embedding_model" in metadata:
+                    return metadata["_embedding_model"]
+
+            # Finally, check user settings (for collections we can't modify)
+            if connection_id:
+                from ...services.settings_service import SettingsService
+                settings = SettingsService()
+                model_info = settings.get_embedding_model(connection_id, collection_name)
+                if model_info:
+                    return model_info["model"]
+
+            return None
+        except Exception as e:
+            print(f"Failed to get embedding model: {e}")
+            return None
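A short, hypothetical usage sketch of the new get_embedding_model() fallback chain (not code from the package); `conn` stands in for any concrete connection subclass and the identifiers are examples only.

    # Resolution order: collection metadata -> a sample vector's _embedding_model
    # field -> saved user settings keyed by (connection_id, collection_name).
    model_name = conn.get_embedding_model("my_docs", connection_id="local-chroma")
    if model_name is None:
        # Nothing recorded anywhere; pick a default from the registry above.
        model_name = "all-MiniLM-L6-v2"
    print(f"Embedding queries with: {model_name}")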
vector_inspector/core/connections/chroma_connection.py
@@ -212,27 +212,37 @@ class ChromaDBConnection(VectorDBConnection):
             # ChromaDB uses cosine distance by default (or can be configured)
             # Try to get metadata from collection if available
             distance_metric = "Cosine (default)"
+            embedding_model = None
             try:
                 # ChromaDB collections may have metadata about distance function
                 col_metadata = collection.metadata
-                if col_metadata ...
-                (old lines 219-225 were not preserved in the rendered diff)
+                if col_metadata:
+                    if "hnsw:space" in col_metadata:
+                        space = col_metadata["hnsw:space"]
+                        if space == "l2":
+                            distance_metric = "Euclidean (L2)"
+                        elif space == "ip":
+                            distance_metric = "Inner Product"
+                        elif space == "cosine":
+                            distance_metric = "Cosine"
+                    # Get embedding model if stored
+                    if "embedding_model" in col_metadata:
+                        embedding_model = col_metadata["embedding_model"]
             except:
                 pass  # Use default if unable to determine

-            (old line 229 was not preserved in the rendered diff)
+            result = {
                 "name": name,
                 "count": count,
                 "metadata_fields": metadata_fields,
                 "vector_dimension": vector_dimension,
                 "distance_metric": distance_metric,
             }
+
+            if embedding_model:
+                result["embedding_model"] = embedding_model
+
+            return result
         except Exception as e:
             print(f"Failed to get collection info: {e}")
             return None
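The metadata keys read above can be set when a collection is created. A minimal sketch using the chromadb client directly (assumed usage, not code from this package): "hnsw:space" is Chroma's own distance setting, while "embedding_model" is the application-level key this release reads back in get_collection_info().

    import chromadb

    client = chromadb.Client()  # in-memory client, for illustration only
    client.create_collection(
        name="articles",
        metadata={
            "hnsw:space": "cosine",                   # reported as "Cosine"
            "embedding_model": "all-mpnet-base-v2",   # surfaced as embedding_model
        },
    )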
vector_inspector/core/connections/chroma_connection.py
@@ -453,8 +463,34 @@ class ChromaDBConnection(VectorDBConnection):

     # Implement base connection uniform APIs
     def create_collection(self, name: str, vector_size: int, distance: str = "Cosine") -> bool:
-        """Create a collection.
-        (old line 457 was not preserved in the rendered diff)
+        """Create a collection. If it doesn't exist, attempt to create it using Chroma client APIs."""
+        if not self._client:
+            return False
+
+        try:
+            # Prefer get_or_create_collection if available
+            if hasattr(self._client, "get_or_create_collection"):
+                col = self._client.get_or_create_collection(name=name)
+                self._current_collection = col
+                return True
+
+            # Fallback to create_collection/create and then fetch
+            if hasattr(self._client, "create_collection"):
+                try:
+                    self._client.create_collection(name=name)
+                except Exception:
+                    # Some clients may raise if already exists; ignore
+                    pass
+                col = self._client.get_collection(name=name)
+                self._current_collection = col
+                return col is not None
+
+            # As a last resort, check if collection exists
+            col = self.get_collection(name)
+            return col is not None
+        except Exception as e:
+            print(f"Failed to create collection: {e}")
+            return False

     def get_items(self, name: str, ids: List[str]) -> Dict[str, Any]:
         """Retrieve items by IDs."""