glinker-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
glinker/l2/models.py ADDED
@@ -0,0 +1,99 @@
+ from pydantic import Field, BaseModel
+ from typing import List, Dict, Any, Optional, Literal
+ from glinker.core.base import BaseConfig, BaseInput, BaseOutput
+
+
+ class DatabaseRecord(BaseModel):
+     """
+     Unified record format shared by all database layers.
+
+     All layers (Dict, Redis, Elasticsearch, Postgres) use this format.
+     """
+     entity_id: str = Field(..., description="Unique entity identifier")
+     label: str = Field(..., description="Primary label/name")
+     aliases: List[str] = Field(default_factory=list, description="Alternative names")
+     description: str = Field(default="", description="Entity description")
+     entity_type: str = Field(default="", description="Entity type/category")
+     popularity: int = Field(default=0, description="Popularity score")
+     metadata: Dict[str, Any] = Field(
+         default_factory=dict,
+         description="Database-specific metadata"
+     )
+     source: str = Field(default="", description="Source layer: dict|redis|elasticsearch|postgres")
+
+     # Embedding fields for precomputed label embeddings
+     embedding: Optional[List[float]] = Field(
+         default=None,
+         description="Precomputed label embedding vector"
+     )
+     embedding_model_id: Optional[str] = Field(
+         default=None,
+         description="Model ID used to compute the embedding"
+     )
+
+
+ class FuzzyConfig(BaseConfig):
+     """Fuzzy search configuration"""
+     max_distance: int = Field(2, description="Maximum Levenshtein distance")
+     min_similarity: float = Field(0.3, description="Minimum similarity threshold")
+     n_gram_size: int = Field(3, description="N-gram size for matching")
+     prefix_length: int = Field(1, description="Prefix length to preserve")
+
+
+ class LayerConfig(BaseConfig):
+     """Database layer configuration"""
+     type: str = Field(..., description="Layer type: dict|redis|elasticsearch|postgres")
+     priority: int = Field(..., description="Search priority (0 = highest)")
+     config: Dict[str, Any] = Field(default_factory=dict, description="Layer-specific config")
+
+     search_mode: List[Literal["exact", "fuzzy"]] = Field(
+         ["exact"],
+         description="Search methods: ['exact'], ['fuzzy'], or ['exact', 'fuzzy']"
+     )
+
+     write: bool = Field(True, description="Enable write operations")
+     cache_policy: str = Field("always", description="Cache policy: always|miss|hit")
+     ttl: int = Field(3600, description="TTL in seconds (0 = no expiry)")
+     field_mapping: Dict[str, str] = Field(
+         default_factory=lambda: {
+             "entity_id": "entity_id",
+             "label": "label",
+             "aliases": "aliases",
+             "description": "description",
+             "entity_type": "entity_type",
+             "popularity": "popularity"
+         },
+         description="Field mapping: DatabaseRecord field -> storage field"
+     )
+     fuzzy: Optional[FuzzyConfig] = Field(default_factory=FuzzyConfig, description="Fuzzy search config")
+
+
+ class EmbeddingConfig(BaseModel):
+     """Configuration for precomputed label embeddings"""
+     enabled: bool = Field(False, description="Enable embedding support")
+     model_name: Optional[str] = Field(None, description="Model name for encoding labels")
+     dim: int = Field(768, description="Embedding dimension")
+     precompute_on_load: bool = Field(False, description="Compute embeddings during load_bulk")
+     batch_size: int = Field(32, description="Batch size for encoding")
+
+
+ class L2Config(BaseConfig):
+     """L2 processor configuration"""
+     layers: List[LayerConfig] = Field(..., description="Database layers in priority order")
+     max_candidates: int = Field(30, description="Maximum candidates per mention")
+     min_popularity: int = Field(0, description="Minimum popularity threshold")
+     embeddings: Optional[EmbeddingConfig] = Field(
+         default=None,
+         description="Embedding configuration for precomputed labels"
+     )
+
+
+ class L2Input(BaseInput):
+     """L2 processor input"""
+     mentions: List[str] = Field(..., description="List of mentions to search")
+     structure: Optional[List[List[str]]] = Field(None, description="Optional grouping structure")
+
+
+ class L2Output(BaseOutput):
+     """L2 processor output"""
+     candidates: List[List[DatabaseRecord]] = Field(..., description="Candidates per mention/group")
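For reference, a minimal sketch of how these config models compose. The values are illustrative, and it assumes BaseConfig adds no further required fields beyond those shown here:

from glinker.l2.models import L2Config, LayerConfig, FuzzyConfig

# Two layers searched in priority order: an in-memory dict first,
# then Elasticsearch with fuzzy matching enabled on top of exact search.
config = L2Config(
    layers=[
        LayerConfig(type="dict", priority=0, search_mode=["exact"]),
        LayerConfig(
            type="elasticsearch",
            priority=1,
            search_mode=["exact", "fuzzy"],
            fuzzy=FuzzyConfig(max_distance=1),
        ),
    ],
    max_candidates=10,
)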
glinker/l2/processor.py ADDED
@@ -0,0 +1,170 @@
+ from typing import Any, List, Optional, Union
+ from glinker.core.base import BaseProcessor
+ from glinker.core.registry import processor_registry
+ from .models import L2Config, L2Input, L2Output, DatabaseRecord
+ from .component import DatabaseChainComponent
+
+
+ class L2Processor(BaseProcessor[L2Config, L2Input, L2Output]):
+     """Multi-layer database search processor"""
+
+     def __init__(
+         self,
+         config: L2Config,
+         component: DatabaseChainComponent,
+         pipeline: Optional[list[tuple[str, dict[str, Any]]]] = None
+     ):
+         super().__init__(config, component, pipeline)
+         self.schema = {}  # Will be set by the DAG executor from node config
+
+     def format_label(self, record: DatabaseRecord) -> str:
+         """Format a record's label using the schema template"""
+         template = self.schema.get('template', '{label}')
+         try:
+             return template.format(**record.model_dump())
+         except KeyError:
+             return record.label
+
+     def precompute_embeddings(
+         self,
+         encoder_fn,
+         target_layers: Optional[List[str]] = None,
+         batch_size: int = 32
+     ):
+         """
+         Precompute embeddings for entities using the schema template.
+
+         Args:
+             encoder_fn: Function that takes a List[str] and returns embeddings
+             target_layers: Layer types to update
+             batch_size: Batch size for encoding
+         """
+         template = self.schema.get('template', '{label}')
+         model_id = self.config.embeddings.model_name if self.config.embeddings else 'unknown'
+
+         return self.component.precompute_embeddings(
+             encoder_fn=encoder_fn,
+             template=template,
+             model_id=model_id,
+             target_layers=target_layers,
+             batch_size=batch_size
+         )
+
+     def _default_pipeline(self) -> list[tuple[str, dict[str, Any]]]:
+         return [
+             ("search", {}),
+             ("filter_by_popularity", {}),
+             ("deduplicate_candidates", {}),
+             ("limit_candidates", {}),
+             ("sort_by_popularity", {})
+         ]
+
+     def __call__(
+         self,
+         mentions: Optional[Union[List[str], List[List[Any]], L2Input]] = None,
+         texts: Optional[List[str]] = None,
+         structure: Optional[List[List[str]]] = None,
+         input_data: Optional[L2Input] = None
+     ) -> L2Output:
+         """
+         Process mentions and return candidates.
+
+         Supports:
+         - List[str]: flat list of mention strings
+         - List[List[L1Entity]]: nested list of L1Entity objects (one list per text)
+         - L2Input: structured input with mentions and structure
+         - mentions=None: return the entire entity database (one copy per text)
+         """
+
+         if input_data is not None:
+             mentions = input_data.mentions
+             structure = input_data.structure
+         elif isinstance(mentions, L2Input):
+             structure = mentions.structure
+             mentions = mentions.mentions
+
+         # No mentions -> return the entire database (simple pipeline mode)
+         if mentions is None:
+             all_entities = self.component.get_all_entities()
+             n = len(texts) if texts is not None else 1
+             return L2Output(candidates=[all_entities for _ in range(n)])
+
+         # Check whether mentions is nested (list of lists, one per text)
+         if mentions and isinstance(mentions[0], (list, tuple)):
+             # Nested structure: [[entities_text1], [entities_text2], ...]
+             all_candidates = []
+
+             for text_entities in mentions:
+                 text_candidates = []
+
+                 for entity in text_entities:
+                     # Extract text from L1Entity or dict
+                     mention_text = self._extract_mention_text(entity)
+
+                     # Search candidates for this mention
+                     candidates = self._execute_pipeline(mention_text, self.pipeline)
+                     text_candidates.extend(candidates)
+
+                 all_candidates.append(text_candidates)
+
+             return L2Output(candidates=all_candidates)
+
+         # Flat structure: ["mention1", "mention2", ...]
+         else:
+             all_candidates = []
+
+             for mention in mentions:
+                 mention_text = self._extract_mention_text(mention)
+                 candidates = self._execute_pipeline(mention_text, self.pipeline)
+                 all_candidates.append(candidates)
+
+             if structure:
+                 grouped = self._group_by_structure(all_candidates, structure)
+             else:
+                 # Flatten all candidates into one group
+                 grouped = [self._flatten(all_candidates)]
+
+             return L2Output(candidates=grouped)
+
+     def _extract_mention_text(self, mention: Any) -> str:
+         """Extract the text string from a mention (L1Entity, dict, or str)"""
+         if isinstance(mention, str):
+             return mention
+         elif hasattr(mention, 'text'):
+             return mention.text
+         elif isinstance(mention, dict):
+             return mention.get('text', str(mention))
+         else:
+             return str(mention)
+
+     def _group_by_structure(
+         self,
+         all_candidates: List[List[DatabaseRecord]],
+         structure: List[List[str]]
+     ) -> List[List[DatabaseRecord]]:
+         """Group candidates according to the given structure"""
+         grouped = []
+         idx = 0
+         for text_mentions in structure:
+             text_candidates = []
+             for _ in text_mentions:
+                 if idx < len(all_candidates):
+                     text_candidates.extend(all_candidates[idx])
+                 idx += 1
+             grouped.append(text_candidates)
+         return grouped
+
+     def _flatten(self, nested: List[List[Any]]) -> List[Any]:
+         """Flatten a nested list"""
+         flat = []
+         for sublist in nested:
+             flat.extend(sublist)
+         return flat
+
+
+ @processor_registry.register("l2_chain")
+ def create_l2_processor(config_dict: dict, pipeline: list = None) -> L2Processor:
+     """Factory: creates component + processor"""
+     config = L2Config(**config_dict)
+     component = DatabaseChainComponent(config)
+     return L2Processor(config, component, pipeline)
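A hedged usage sketch of the flat-mentions path through __call__. It assumes the "l2_chain" factory is registered, that the config dict above satisfies BaseConfig, and that the DatabaseChainComponent has already been loaded with entity records:

processor = create_l2_processor({"layers": [{"type": "dict", "priority": 0}]})

# Flat mentions plus a grouping structure: candidates for "Paris" and
# "France" land in one group, "Berlin" in another.
out = processor(
    mentions=["Paris", "France", "Berlin"],
    structure=[["Paris", "France"], ["Berlin"]],
)
assert len(out.candidates) == 2  # one candidate list per group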
glinker/l3/__init__.py ADDED
@@ -0,0 +1,12 @@
+ from .models import L3Config, L3Input, L3Output, L3Entity
+ from .component import L3Component
+ from .processor import L3Processor
+
+ __all__ = [
+     "L3Config",
+     "L3Input",
+     "L3Output",
+     "L3Entity",
+     "L3Component",
+     "L3Processor",
+ ]
glinker/l3/component.py ADDED
@@ -0,0 +1,184 @@
+ from typing import List, Optional
+ import torch
+ from gliner import GLiNER
+ from glinker.core.base import BaseComponent
+ from .models import L3Config, L3Entity
+
+
+ class L3Component(BaseComponent[L3Config]):
+     """GLiNER-based entity linking component"""
+
+     def _setup(self):
+         """Initialize the GLiNER model"""
+         self.model = GLiNER.from_pretrained(
+             self.config.model_name,
+             token=self.config.token,
+             max_length=self.config.max_length
+         )
+         self.model.to(self.config.device)
+
+         # Fix the labels tokenizer max_length for BiEncoder models:
+         # some models ship with model_max_length not properly set (> 10^18)
+         if (self.config.max_length is not None and
+                 hasattr(self.model, 'data_processor') and
+                 hasattr(self.model.data_processor, 'labels_tokenizer')):
+             tok = self.model.data_processor.labels_tokenizer
+             if tok.model_max_length > 100000:
+                 tok.model_max_length = self.config.max_length
+
+     @property
+     def device(self):
+         return self.config.device
+
+     @property
+     def supports_precomputed_embeddings(self) -> bool:
+         """Check whether the model supports precomputed embeddings (BiEncoder)"""
+         return hasattr(self.model, 'encode_labels') and self.model.config.labels_encoder is not None
+
+     def get_available_methods(self) -> List[str]:
+         return [
+             "predict_entities",
+             "predict_with_embeddings",
+             "encode_labels",
+             "filter_by_score",
+             "sort_by_position",
+             "deduplicate_entities"
+         ]
+
+     def encode_labels(self, labels: List[str], batch_size: int = 32) -> torch.Tensor:
+         """
+         Encode labels using GLiNER's native label encoder.
+
+         Args:
+             labels: List of label strings to encode
+             batch_size: Batch size for encoding
+
+         Returns:
+             Tensor of shape (num_labels, hidden_size)
+
+         Raises:
+             NotImplementedError: If the model doesn't support label encoding (not a BiEncoder)
+         """
+         if not self.supports_precomputed_embeddings:
+             raise NotImplementedError(
+                 f"Model {self.config.model_name} doesn't support label precomputation. "
+                 "Only BiEncoder models support this feature."
+             )
+
+         return self.model.encode_labels(labels, batch_size=batch_size)
+
+     def predict_with_embeddings(
+         self,
+         text: str,
+         labels: List[str],
+         embeddings: torch.Tensor,
+         input_spans: Optional[List[List[dict]]] = None
+     ) -> List[L3Entity]:
+         """
+         Predict entities using precomputed label embeddings.
+
+         Args:
+             text: Input text
+             labels: List of label strings (for output mapping)
+             embeddings: Precomputed embeddings tensor (num_labels, hidden_size)
+             input_spans: Optional list of span dicts with 'start' and 'end' keys
+                 to constrain prediction to specific spans from L1
+
+         Returns:
+             List of L3Entity predictions
+         """
+         if not self.supports_precomputed_embeddings:
+             # Fall back to regular prediction
+             return self.predict_entities(text, labels, input_spans=input_spans)
+
+         kwargs = dict(
+             threshold=self.config.threshold,
+             flat_ner=self.config.flat_ner,
+             multi_label=self.config.multi_label,
+             return_class_probs=True
+         )
+         if input_spans is not None:
+             kwargs["input_spans"] = input_spans
+
+         entities = self.model.predict_with_embeds(
+             text,
+             embeddings,
+             labels,
+             **kwargs
+         )
+
+         return [
+             L3Entity(
+                 text=e["text"],
+                 label=e["label"],
+                 start=e["start"],
+                 end=e["end"],
+                 score=e["score"],
+                 class_probs=e.get("class_probs")
+             )
+             for e in entities
+         ]
+
+     def predict_entities(
+         self,
+         text: str,
+         labels: List[str],
+         input_spans: Optional[List[List[dict]]] = None
+     ) -> List[L3Entity]:
+         """Predict entities using GLiNER.
+
+         Args:
+             text: Input text
+             labels: List of label strings
+             input_spans: Optional list of span dicts with 'start' and 'end' keys
+                 to constrain prediction to specific spans from L1
+         """
+         if not labels:
+             return []
+
+         kwargs = dict(
+             threshold=self.config.threshold,
+             flat_ner=self.config.flat_ner,
+             multi_label=self.config.multi_label,
+             return_class_probs=True
+         )
+         if input_spans is not None:
+             kwargs["input_spans"] = input_spans
+
+         entities = self.model.predict_entities(
+             text,
+             labels,
+             **kwargs
+         )
+
+         return [
+             L3Entity(
+                 text=e["text"],
+                 label=e["label"],
+                 start=e["start"],
+                 end=e["end"],
+                 score=e["score"],
+                 class_probs=e.get("class_probs")
+             )
+             for e in entities
+         ]
+
+     def filter_by_score(self, entities: List[L3Entity], threshold: Optional[float] = None) -> List[L3Entity]:
+         """Filter entities by confidence score"""
+         threshold = threshold if threshold is not None else self.config.threshold
+         return [e for e in entities if e.score >= threshold]
+
+     def sort_by_position(self, entities: List[L3Entity]) -> List[L3Entity]:
+         """Sort entities by position in the text"""
+         return sorted(entities, key=lambda e: e.start)
+
+     def deduplicate_entities(self, entities: List[L3Entity]) -> List[L3Entity]:
+         """Remove duplicate entities"""
+         seen = set()
+         unique = []
+         for entity in entities:
+             key = (entity.text, entity.start, entity.end)
+             if key not in seen:
+                 unique.append(entity)
+                 seen.add(key)
+         return unique
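A sketch of the BiEncoder fast path this component exposes, assuming `component` is an initialized L3Component wrapping a BiEncoder checkpoint (the label strings are illustrative):

text = "She flew from Paris to Berlin."
labels = ["Paris (capital of France)", "Berlin (capital of Germany)"]

if component.supports_precomputed_embeddings:
    # Encode candidate labels once, then reuse the embeddings for prediction
    embeds = component.encode_labels(labels)  # shape: (num_labels, hidden_size)
    entities = component.predict_with_embeddings(text, labels, embeds)
else:
    # Slow path: labels are re-encoded inside the model on every call
    entities = component.predict_entities(text, labels)

entities = component.sort_by_position(component.deduplicate_entities(entities))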
glinker/l3/models.py ADDED
@@ -0,0 +1,48 @@
+ from pydantic import Field
+ from typing import Dict, List, Any, Optional
+ from glinker.core.base import BaseConfig, BaseInput, BaseOutput
+
+
+ class L3Config(BaseConfig):
+     model_name: str = Field(...)
+     token: Optional[str] = Field(None)
+     device: str = Field("cpu")
+     threshold: float = Field(0.5)
+     flat_ner: bool = Field(True)
+     multi_label: bool = Field(False)
+     batch_size: int = Field(8)
+
+     # Embedding settings
+     use_precomputed_embeddings: bool = Field(
+         True,
+         description="Use precomputed embeddings from L2 candidates if available"
+     )
+     cache_embeddings: bool = Field(
+         False,
+         description="Cache computed embeddings back to L2"
+     )
+     max_length: Optional[int] = Field(
+         None,
+         description="Maximum sequence length for tokenization. Passed to GLiNER.from_pretrained."
+     )
+
+
+ # TODO: replace candidates with labels
+ class L3Input(BaseInput):
+     texts: List[str] = Field(...)
+     labels: List[List[Any]] = Field(...)
+
+
+ class L3Entity(BaseOutput):
+     text: str
+     label: str
+     start: int
+     end: int
+     score: float
+     class_probs: Optional[Dict[str, float]] = Field(
+         None, description="Per-label class probabilities from GLiNER"
+     )
+
+
+ class L3Output(BaseOutput):
+     entities: List[List[L3Entity]] = Field(...)
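A minimal sketch of how these models are instantiated. The checkpoint name is illustrative only, and it assumes BaseConfig and BaseInput add no further required fields:

config = L3Config(
    model_name="knowledgator/gliner-bi-small-v1.0",  # hypothetical BiEncoder checkpoint
    device="cpu",
    threshold=0.4,
    max_length=512,
)

# texts and labels are parallel lists: one label list per input text
inp = L3Input(
    texts=["She flew from Paris to Berlin."],
    labels=[["Paris (capital of France)", "Berlin (capital of Germany)"]],
)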