PyPI - convmemory - Versions diffs - 0.4.0__py3-none-any.whl - Mend

convmemory 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

convmemory/__init__.py +35 -0
convmemory/api.py +733 -0
convmemory/ccge.py +391 -0
convmemory/encoder.py +150 -0
convmemory/hub.py +45 -0
convmemory/metrics.py +14 -0
convmemory/models.py +31 -0
convmemory/reranker.py +253 -0
convmemory/routing.py +208 -0
convmemory/scoring.py +314 -0
convmemory-0.4.0.dist-info/LICENSE +21 -0
convmemory-0.4.0.dist-info/METADATA +517 -0
convmemory-0.4.0.dist-info/RECORD +15 -0
convmemory-0.4.0.dist-info/WHEEL +5 -0
convmemory-0.4.0.dist-info/top_level.txt +1 -0

convmemory/ccge.py ADDED Viewed

@@ -0,0 +1,391 @@
+"""CCGE-LA conflict-aware candidate-set editor.
+CCGE-LA stands for Low-Amplitude Counterfactual Conflict Graph Editor. It is a
+lightweight editor that runs after ConvMemory and applies a small residual score
+correction when the retrieved candidate set looks conflict-prone.
+The module is intentionally checkpoint-agnostic. Applications can attach a
+trained editor with ``ConvMemory.attach_ccge_editor`` or load one from disk with
+``ConvMemory.load_ccge_editor``.
+"""
+from __future__ import annotations
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Sequence
+import numpy as np
+import torch
+from torch import nn
+from .hub import resolve_checkpoint_path
+from .scoring import lexical_signature
+# Column order of the per-candidate feature matrix produced by
+# ``build_ccge_features``. ``CCGELowAmplitudeEditor.forward`` reads columns by
+# integer position (0, 4, 8 and 13-17), so this order must not be changed
+# without updating those indices.
+FEATURE_NAMES = [
+    "base_score_z",
+    "dense_score_z",
+    "position_z",
+    "query_overlap_z",
+    "base_rank_norm",
+    "dense_rank_norm",
+    "sim_to_base_top",
+    "sim_to_dense_top",
+    "semantic_density_top16",
+    "token_overlap_to_top",
+    "newer_than_base_top",
+    "older_than_base_top",
+    "abs_pos_gap_top_z",
+    "base_margin_1_2",
+    "base_entropy_top16",
+    "conflict_density_top16",
+    "time_span_top16",
+    "top_overlap",
+]
+@dataclass(frozen=True)
+class CCGEConfig:
+    """Configuration for the public CCGE-LA editor.
+    The fields mirror the constructor arguments of ``CCGELowAmplitudeEditor``;
+    ``save_pretrained`` stores them and ``from_pretrained`` passes them back.
+    """
+    # Number of feature columns per candidate.
+    feature_dim: int = len(FEATURE_NAMES)
+    # Width of the Transformer encoder (``d_model``).
+    model_dim: int = 96
+    # Number of Transformer encoder layers.
+    layers: int = 2
+    # Attention heads per encoder layer; must divide ``model_dim``.
+    num_heads: int = 4
+    # Dropout used inside the encoder layers.
+    dropout: float = 0.08
+    # Initial bias of the final gate layer.
+    gate_bias: float = -2.0
+    # Initial value of the learnable residual scale.
+    residual_init: float = 0.35
+@dataclass(frozen=True)
+class CCGEFeatureBatch:
+    """Feature matrix for one query's candidate set."""
+    # Candidate identifiers, in the same order as the rows of ``features``.
+    candidate_ids: list[str]
+    # When built by ``build_ccge_features``: float32 array of shape
+    # [num_candidates, len(FEATURE_NAMES)].
+    features: np.ndarray
+def zscore(values: np.ndarray) -> np.ndarray:
+    values = np.asarray(values, dtype=np.float32)
+    if values.size == 0:
+        return values
+    std = float(values.std())
+    if std < 1.0e-6:
+        return values - float(values.mean())
+    return (values - float(values.mean())) / std
+def rank_norm(scores: np.ndarray) -> np.ndarray:
+    order = np.argsort(-scores, kind="mergesort")
+    ranks = np.zeros(len(scores), dtype=np.float32)
+    for rank, idx in enumerate(order):
+        ranks[int(idx)] = rank / max(1, len(scores) - 1)
+    return ranks
+def softmax_entropy(values: np.ndarray) -> float:
+    if values.size <= 1:
+        return 0.0
+    x = np.asarray(values, dtype=np.float32)
+    x = x - float(x.max())
+    p = np.exp(x)
+    p = p / max(float(p.sum()), 1.0e-8)
+    return float(-(p * np.log(p + 1.0e-8)).sum() / np.log(len(p)))
+def normalized_embeddings(embeddings: np.ndarray | None, n: int) -> np.ndarray:
+    if embeddings is None:
+        return np.eye(n, dtype=np.float32)
+    x = np.asarray(embeddings, dtype=np.float32)
+    if x.ndim != 2 or x.shape[0] != n:
+        raise ValueError("candidate_embeddings must have shape [num_candidates, dim]")
+    return x / (np.linalg.norm(x, axis=1, keepdims=True) + 1.0e-8)
+def query_overlap_scores(query: str, candidate_texts: Sequence[str]) -> np.ndarray:
+    """Lexical overlap scores for query and candidate memories."""
+    query_set, _ = lexical_signature(query)
+    values = []
+    for text in candidate_texts:
+        memory_set, _ = lexical_signature(str(text))
+        values.append(len(query_set & memory_set) / max(1, len(query_set)))
+    return np.asarray(values, dtype=np.float32)
+def token_overlap_to_text(candidate_texts: Sequence[str], top_index: int) -> np.ndarray:
+    """Token overlap between each candidate and the selected top candidate."""
+    top_set, _ = lexical_signature(str(candidate_texts[int(top_index)]))
+    values = []
+    for text in candidate_texts:
+        memory_set, _ = lexical_signature(str(text))
+        union = top_set | memory_set
+        values.append(len(top_set & memory_set) / max(1, len(union)))
+    return np.asarray(values, dtype=np.float32)
+def build_ccge_features(
+    *,
+    candidate_ids: Sequence[str],
+    convmemory_scores: Sequence[float],
+    dense_scores: Sequence[float] | None = None,
+    positions: Sequence[float] | None = None,
+    candidate_embeddings: np.ndarray | None = None,
+    query_overlaps: Sequence[float] | None = None,
+    query: str | None = None,
+    candidate_texts: Sequence[str] | None = None,
+    top_k_density: int = 16,
+) -> CCGEFeatureBatch:
+    """Build CCGE-LA candidate-set features.
+    The features describe the retrieved candidate set. They do not encode
+    gold/current/stale labels and are safe to compute at inference time.
+    """
+    ids = [str(x) for x in candidate_ids]
+    n = len(ids)
+    if n == 0:
+        raise ValueError("candidate_ids must not be empty")
+    base = np.asarray(convmemory_scores, dtype=np.float32)
+    if base.shape[0] != n:
+        raise ValueError("convmemory_scores must match candidate_ids")
+    dense = np.asarray(dense_scores if dense_scores is not None else base, dtype=np.float32)
+    pos = np.asarray(positions if positions is not None else np.arange(n), dtype=np.float32)
+    if query_overlaps is not None:
+        overlap = np.asarray(query_overlaps, dtype=np.float32)
+    elif query is not None and candidate_texts is not None:
+        overlap = query_overlap_scores(query, candidate_texts)
+    else:
+        overlap = np.zeros(n, dtype=np.float32)
+    if dense.shape[0] != n or pos.shape[0] != n or overlap.shape[0] != n:
+        raise ValueError("dense_scores, positions, and query_overlaps must match candidate_ids")
+    emb = normalized_embeddings(candidate_embeddings, n)
+    base_order = np.argsort(-base, kind="mergesort")
+    dense_order = np.argsort(-dense, kind="mergesort")
+    top_base = int(base_order[0])
+    top_dense = int(dense_order[0])
+    topk = base_order[: min(top_k_density, n)]
+    sim_to_base_top = emb @ emb[top_base]
+    sim_to_dense_top = emb @ emb[top_dense]
+    density = (emb @ emb[topk].T).mean(axis=1) if len(topk) else np.zeros(n, dtype=np.float32)
+    if candidate_texts is not None:
+        overlap_to_top = token_overlap_to_text(candidate_texts, top_base)
+    else:
+        overlap_to_top = np.full(n, float(overlap[top_base]), dtype=np.float32)
+    pos_gap = np.abs(pos - pos[top_base])
+    sorted_base_z = np.sort(zscore(base))[::-1]
+    margin = float(sorted_base_z[0] - sorted_base_z[1]) if len(sorted_base_z) > 1 else 0.0
+    entropy = softmax_entropy(zscore(base)[topk])
+    conflict_density = (
+        float(np.mean((sim_to_base_top[topk] > 0.45) & (np.abs(pos[topk] - pos[top_base]) > 0)))
+        if len(topk)
+        else 0.0
+    )
+    span = float(pos[topk].max() - pos[topk].min()) if len(topk) else 0.0
+    full_span = max(1.0, float(pos.max() - pos.min()))
+    top_overlap = float(overlap[top_base])
+    features = np.stack(
+        [
+            zscore(base),
+            zscore(dense),
+            zscore(pos),
+            zscore(overlap),
+            rank_norm(base),
+            rank_norm(dense),
+            sim_to_base_top.astype(np.float32),
+            sim_to_dense_top.astype(np.float32),
+            density.astype(np.float32),
+            overlap_to_top.astype(np.float32),
+            (pos > pos[top_base]).astype(np.float32),
+            (pos < pos[top_base]).astype(np.float32),
+            zscore(pos_gap),
+            np.full(n, margin, dtype=np.float32),
+            np.full(n, entropy, dtype=np.float32),
+            np.full(n, conflict_density, dtype=np.float32),
+            np.full(n, span / full_span, dtype=np.float32),
+            np.full(n, top_overlap, dtype=np.float32),
+        ],
+        axis=1,
+    ).astype(np.float32)
+    return CCGEFeatureBatch(candidate_ids=ids, features=features)
+class CCGELowAmplitudeEditor(nn.Module):
+    """Low-amplitude residual editor over ConvMemory candidate scores."""
+    def __init__(
+        self,
+        feature_dim: int = len(FEATURE_NAMES),
+        *,
+        model_dim: int = 96,
+        layers: int = 2,
+        num_heads: int = 4,
+        dropout: float = 0.08,
+        gate_bias: float = -2.0,
+        residual_init: float = 0.35,
+    ):
+        super().__init__()
+        if model_dim % num_heads != 0:
+            raise ValueError("model_dim must be divisible by num_heads")
+        self.config = CCGEConfig(
+            feature_dim=int(feature_dim),
+            model_dim=int(model_dim),
+            layers=int(layers),
+            num_heads=int(num_heads),
+            dropout=float(dropout),
+            gate_bias=float(gate_bias),
+            residual_init=float(residual_init),
+        )
+        self.trained_embedding_model_name = None
+        self.in_proj = nn.Sequential(
+            nn.Linear(feature_dim, model_dim),
+            nn.GELU(),
+            nn.LayerNorm(model_dim),
+        )
+        enc = nn.TransformerEncoderLayer(
+            d_model=model_dim,
+            nhead=num_heads,
+            dim_feedforward=model_dim * 3,
+            dropout=dropout,
+            activation="gelu",
+            batch_first=True,
+            norm_first=True,
+        )
+        self.encoder = nn.TransformerEncoder(enc, num_layers=layers)
+        self.residual = nn.Sequential(
+            nn.Linear(model_dim, model_dim),
+            nn.GELU(),
+            nn.Dropout(0.05),
+            nn.Linear(model_dim, 1),
+        )
+        self.gate = nn.Sequential(nn.Linear(model_dim + 7, 64), nn.GELU(), nn.Linear(64, 1))
+        self.residual_scale = nn.Parameter(torch.tensor(float(residual_init)))
+        nn.init.zeros_(self.gate[-1].weight)
+        nn.init.constant_(self.gate[-1].bias, gate_bias)
+    def forward(self, features: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        base = features[..., 0]
+        h = self.encoder(self.in_proj(features))
+        residual = self.residual(h).squeeze(-1)
+        pooled = h.mean(dim=1)
+        state = torch.stack(
+            [
+                features[..., 13].mean(dim=1),
+                features[..., 14].mean(dim=1),
+                features[..., 15].mean(dim=1),
+                features[..., 16].mean(dim=1),
+                features[..., 17].mean(dim=1),
+                (features[..., 4] < 0.05).float().mean(dim=1),
+                features[..., 8].max(dim=1).values,
+            ],
+            dim=-1,
+        )
+        gate = torch.sigmoid(self.gate(torch.cat([pooled, state], dim=-1))).squeeze(-1)
+        scale = torch.clamp(self.residual_scale, 0.05, 2.0)
+        scores = base + gate.unsqueeze(-1) * scale * residual
+        return scores, gate
+    @torch.no_grad()
+    def edit_batch(
+        self,
+        batch: CCGEFeatureBatch,
+        *,
+        device: str | torch.device | None = None,
+    ) -> tuple[np.ndarray, float]:
+        """Return edited scores and the query-level gate for one feature batch."""
+        if device is None:
+            device = next(self.parameters()).device
+        self.eval()
+        x = torch.tensor(batch.features, dtype=torch.float32, device=device).unsqueeze(0)
+        scores, gate = self.to(device)(x)
+        return scores.detach().cpu().numpy()[0], float(gate.detach().cpu().numpy()[0])
+    def save_pretrained(self, path: str | Path) -> None:
+        """Save a CCGE-LA editor checkpoint."""
+        path = Path(path)
+        if path.suffix:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            target = path
+        else:
+            path.mkdir(parents=True, exist_ok=True)
+            target = path / "ccge_la.pt"
+        torch.save(
+            {
+                "format": "convmemory-ccge-la",
+                "version": 1,
+                "config": asdict(self.config),
+                "state_dict": self.state_dict(),
+                "trained_embedding_model_name": getattr(
+                    self,
+                    "trained_embedding_model_name",
+                    None,
+                ),
+            },
+            target,
+        )
+    @classmethod
+    def from_pretrained(
+        cls,
+        path: str | Path,
+        *,
+        device: str | torch.device = "cpu",
+        strict: bool = True,
+    ) -> "CCGELowAmplitudeEditor":
+        """Load a CCGE-LA editor checkpoint from disk or Hugging Face Hub."""
+        path = resolve_checkpoint_path(path)
+        source = path / "ccge_la.pt" if path.is_dir() else path
+        payload = torch.load(source, map_location="cpu")
+        config = payload.get("config", {})
+        model = cls(**config)
+        state_dict = payload.get("state_dict", payload)
+        model.load_state_dict(state_dict, strict=strict)
+        model.trained_embedding_model_name = payload.get("trained_embedding_model_name")
+        return model.to(device).eval()
+def multi_positive_retrieval_loss(scores: torch.Tensor, gold_mask: torch.Tensor) -> torch.Tensor:
+    """Retrieval cross-entropy for one or more positive candidates."""
+    all_lse = torch.logsumexp(scores, dim=-1)
+    masked = scores.masked_fill(~gold_mask, -1.0e9)
+    gold_lse = torch.logsumexp(masked, dim=-1)
+    return -(gold_lse - all_lse).mean()
+@torch.no_grad()
+def rank_candidates(
+    editor: CCGELowAmplitudeEditor,
+    batch: CCGEFeatureBatch,
+    *,
+    device: str | torch.device = "cpu",
+) -> list[tuple[str, float]]:
+    """Return candidate ids sorted by edited CCGE-LA score."""
+    values, _ = editor.edit_batch(batch, device=device)
+    order = np.argsort(-values, kind="mergesort")
+    return [(batch.candidate_ids[int(i)], float(values[int(i)])) for i in order]
+# Names exported by ``from convmemory.ccge import *``.
+__all__ = [
+    "FEATURE_NAMES",
+    "CCGEConfig",
+    "CCGEFeatureBatch",
+    "CCGELowAmplitudeEditor",
+    "build_ccge_features",
+    "multi_positive_retrieval_loss",
+    "query_overlap_scores",
+    "rank_candidates",
+    "token_overlap_to_text",
+]

convmemory/encoder.py ADDED Viewed

@@ -0,0 +1,150 @@
+import torch
+class MixerConvMemoryEncoder(torch.nn.Module):
+    """Lightweight temporal encoder over a short memory window.
+    The input shape is `[batch, window, embedding_dim]`. The query embedding is
+    used both for feature construction and query-aware pooling.
+    """
+    def __init__(
+        self,
+        dim,
+        window_size=5,
+        kernel_size=3,
+        hidden_dim=256,
+        token_mlp_dim=32,
+        channel_mlp_dim=512,
+        type_vocab_size=0,
+        output_mode="residual",
+        output_gate_init=0.1,
+        score_mode="cosine",
+        score_gate_init=0.1,
+    ):
+        super().__init__()
+        self.window_size = window_size
+        self.output_mode = output_mode
+        self.score_mode = score_mode
+        self.type_embedding = None
+        if type_vocab_size:
+            self.type_embedding = torch.nn.Embedding(type_vocab_size, dim)
+        self.input_proj = torch.nn.Sequential(
+            torch.nn.Linear(dim * 3, hidden_dim),
+            torch.nn.GELU(),
+            torch.nn.LayerNorm(hidden_dim),
+        )
+        self.conv_norm = torch.nn.LayerNorm(hidden_dim)
+        self.depthwise_conv = torch.nn.Conv1d(
+            hidden_dim,
+            hidden_dim,
+            kernel_size=kernel_size,
+            padding=kernel_size // 2,
+            groups=hidden_dim,
+        )
+        self.pointwise = torch.nn.Linear(hidden_dim, hidden_dim)
+        self.conv_gate = torch.nn.Parameter(torch.tensor(0.1))
+        self.token_norm = torch.nn.LayerNorm(window_size)
+        self.token_mlp = torch.nn.Sequential(
+            torch.nn.Linear(window_size, token_mlp_dim),
+            torch.nn.GELU(),
+            torch.nn.Linear(token_mlp_dim, window_size),
+        )
+        self.token_gate = torch.nn.Parameter(torch.tensor(0.1))
+        self.channel_norm = torch.nn.LayerNorm(hidden_dim)
+        self.channel_mlp = torch.nn.Sequential(
+            torch.nn.Linear(hidden_dim, channel_mlp_dim),
+            torch.nn.GELU(),
+            torch.nn.Linear(channel_mlp_dim, hidden_dim),
+        )
+        self.channel_gate = torch.nn.Parameter(torch.tensor(0.1))
+        self.query_proj = torch.nn.Linear(dim, hidden_dim)
+        self.attn_x = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
+        self.attn_q = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
+        self.attn_v = torch.nn.Linear(hidden_dim, 1, bias=False)
+        self.output_head = torch.nn.Sequential(
+            torch.nn.Linear(hidden_dim * 4, dim),
+            torch.nn.LayerNorm(dim),
+        )
+        self.output_gate = torch.nn.Parameter(torch.tensor(float(output_gate_init)))
+        self.score_head = torch.nn.Sequential(
+            torch.nn.Linear(dim * 4, hidden_dim),
+            torch.nn.GELU(),
+            torch.nn.Linear(hidden_dim, 1),
+        )
+        self.score_gate = torch.nn.Parameter(torch.tensor(float(score_gate_init)))
+    def _token_mix(self, h):
+        length = h.shape[1]
+        if length < self.window_size:
+            pad = torch.zeros(
+                h.shape[0],
+                self.window_size - length,
+                h.shape[2],
+                dtype=h.dtype,
+                device=h.device,
+            )
+            h_for_mix = torch.cat([h, pad], dim=1)
+        else:
+            h_for_mix = h[:, : self.window_size]
+        mixed = h_for_mix.transpose(1, 2)
+        mixed = self.token_mlp(self.token_norm(mixed)).transpose(1, 2)
+        return mixed[:, :length]
+    def forward(self, x, query=None, type_ids=None):
+        base_x = x
+        if self.type_embedding is not None and type_ids is not None:
+            x = x + self.type_embedding(type_ids)
+            base_x = x
+        if query is None:
+            query = x.mean(dim=1)
+        query_norm = torch.nn.functional.normalize(query, dim=-1)
+        base_norm = torch.nn.functional.normalize(base_x, dim=-1)
+        base_scores = (base_norm * query_norm[:, None, :]).sum(dim=-1)
+        base_weights = torch.softmax(base_scores, dim=1)
+        base = (base_x * base_weights[:, :, None]).sum(dim=1)
+        query_per_turn = query[:, None, :].expand(-1, x.shape[1], -1)
+        features = torch.cat([x, x * query_per_turn, torch.abs(x - query_per_turn)], dim=-1)
+        h = self.input_proj(features)
+        conv_in = self.conv_norm(h).transpose(1, 2)
+        conv_out = self.depthwise_conv(conv_in).transpose(1, 2)
+        h = h + self.conv_gate * self.pointwise(torch.nn.functional.gelu(conv_out))
+        h = h + self.token_gate * self._token_mix(h)
+        h = h + self.channel_gate * self.channel_mlp(self.channel_norm(h))
+        qh = self.query_proj(query)
+        attn = self.attn_v(torch.tanh(self.attn_x(h) + self.attn_q(qh)[:, None, :])).squeeze(-1)
+        weights = torch.softmax(attn, dim=1)
+        pooled = (h * weights[:, :, None]).sum(dim=1)
+        out = self.output_head(
+            torch.cat([pooled, qh, pooled * qh, torch.abs(pooled - qh)], dim=-1)
+        )
+        if self.output_mode == "residual":
+            out = base + self.output_gate * out
+        return torch.nn.functional.normalize(out, dim=-1)
+    def score_windows(self, x, query=None, type_ids=None):
+        vectors = self.forward(x, query=query, type_ids=type_ids)
+        if query is None:
+            query = x.mean(dim=1)
+        query_norm = torch.nn.functional.normalize(query, dim=-1)
+        cosine = (vectors * query_norm).sum(dim=-1)
+        if self.score_mode == "cosine":
+            return cosine
+        features = torch.cat(
+            [vectors, query_norm, vectors * query_norm, torch.abs(vectors - query_norm)],
+            dim=-1,
+        )
+        correction = torch.tanh(self.score_head(features).squeeze(-1))
+        return cosine + self.score_gate * correction

convmemory/hub.py ADDED Viewed

@@ -0,0 +1,45 @@
+"""Optional Hugging Face Hub path resolution helpers."""
+from __future__ import annotations
+from pathlib import Path
+try:
+    from huggingface_hub import snapshot_download as _hf_snapshot_download
+except Exception:  # pragma: no cover - exercised when optional dep is absent
+    _hf_snapshot_download = None
+def looks_like_hub_id(path: str | Path) -> bool:
+    """Return whether a missing path looks like a `namespace/repo` Hub id."""
+    text = str(path).replace("\\", "/").strip()
+    if not text or "://" in text or ":" in text:
+        return False
+    if text.startswith(("/", "./", "../", "~")):
+        return False
+    parts = text.split("/")
+    return len(parts) == 2 and all(parts)
+def resolve_checkpoint_path(path: str | Path, *, repo_type: str = "model") -> Path:
+    """Resolve a local checkpoint path or download a Hugging Face Hub repo id."""
+    candidate = Path(path)
+    if candidate.exists():
+        return candidate
+    if not looks_like_hub_id(path):
+        return candidate
+    if _hf_snapshot_download is None:
+        raise ValueError(
+            "Checkpoint path does not exist and looks like a Hugging Face Hub "
+            "repo id, but `huggingface_hub` is not installed. Install it with "
+            "`pip install huggingface_hub` or pass a local checkpoint path."
+        )
+    try:
+        return Path(_hf_snapshot_download(repo_id=str(path), repo_type=repo_type))
+    except Exception as exc:
+        raise ValueError(
+            f"Could not download Hugging Face Hub checkpoint repo '{path}'. "
+            "Pass a local checkpoint path or verify repo access."
+        ) from exc

convmemory/metrics.py ADDED Viewed

@@ -0,0 +1,14 @@
+def recall_at_k(ranked_ids, gold_ids, k):
+    return len(set(ranked_ids[:k]) & set(gold_ids)) / max(1, len(gold_ids))
+def hit_at_k(ranked_ids, gold_ids, k):
+    return float(bool(set(ranked_ids[:k]) & set(gold_ids)))
+def mrr(ranked_ids, gold_ids):
+    gold = set(gold_ids)
+    for rank, item_id in enumerate(ranked_ids, start=1):
+        if item_id in gold:
+            return 1.0 / rank
+    return 0.0

convmemory/models.py ADDED Viewed

@@ -0,0 +1,31 @@
+from .encoder import MixerConvMemoryEncoder
+from .scoring import CELiteScorer
+def build_default_components(
+    embedding_dim,
+    window_size=5,
+    kernel_size=3,
+    hidden_dim=256,
+    token_mlp_dim=32,
+    channel_mlp_dim=512,
+    extra_scalar_features=5,
+    device="cpu",
+):
+    conv_model = MixerConvMemoryEncoder(
+        embedding_dim,
+        window_size=window_size,
+        kernel_size=kernel_size,
+        hidden_dim=hidden_dim,
+        token_mlp_dim=token_mlp_dim,
+        channel_mlp_dim=channel_mlp_dim,
+        output_mode="residual",
+        output_gate_init=0.1,
+        score_mode="cosine",
+    ).to(device)
+    scorer = CELiteScorer(
+        embedding_dim,
+        hidden_dim=hidden_dim,
+        extra_scalar_features=extra_scalar_features,
+    ).to(device)
+    return conv_model, scorer