glinker-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
glinker/l1/component.py ADDED
@@ -0,0 +1,284 @@
+ import spacy
+ from spacy.language import Language
+ from typing import Optional
+ import torch
+ from glinker.core.base import BaseComponent
+ from .models import L1Config, L1GlinerConfig, L1Entity
+
+
+ class L1SpacyComponent(BaseComponent[L1Config]):
+     """spaCy-based entity extraction component."""
+
+     def _setup(self):
+         """Initialize the spaCy model."""
+         self.nlp = self._load_model()
+
+     def _load_model(self) -> Language:
+         """Load the spaCy model, downloading it first if it is not installed."""
+         if self.config.device != "cpu":
+             # require_gpu() must be called before spacy.load() so the
+             # pipeline is allocated on the GPU from the start; calling it
+             # after loading does not move an already-loaded model.
+             spacy.require_gpu()
+         try:
+             return spacy.load(self.config.model)
+         except OSError:
+             from spacy.cli import download
+             download(self.config.model)
+             return spacy.load(self.config.model)
+
+     def get_available_methods(self) -> list[str]:
+         """Return list of available pipeline methods"""
+         return [
+             "extract_entities",
+             "filter_by_length",
+             "deduplicate",
+             "sort_by_position",
+             "add_noun_chunks"
+         ]
+
+     def extract_entities(self, text: str) -> list[L1Entity]:
+         """Extract named entities from text"""
+         doc = self.nlp(text)
+         entities = []
+         seen_spans = set()
+
+         for ent in doc.ents:
+             span = (ent.start_char, ent.end_char)
+             if span in seen_spans:
+                 continue
+
+             left_context, right_context = self._get_context(
+                 text, ent.start_char, ent.end_char
+             )
+
+             entities.append(L1Entity(
+                 text=ent.text,
+                 start=ent.start_char,
+                 end=ent.end_char,
+                 left_context=left_context,
+                 right_context=right_context
+             ))
+             seen_spans.add(span)
+
+         return entities
+
+     def filter_by_length(
+         self,
+         entities: list[L1Entity],
+         min_length: Optional[int] = None
+     ) -> list[L1Entity]:
+         """Filter entities by minimum text length"""
+         min_len = min_length if min_length is not None else self.config.min_entity_length
+         return [e for e in entities if len(e.text) >= min_len]
+
+     def deduplicate(self, entities: list[L1Entity]) -> list[L1Entity]:
+         """Remove duplicate entities by span"""
+         seen_spans = set()
+         unique = []
+
+         for entity in entities:
+             span = (entity.start, entity.end)
+             if span not in seen_spans:
+                 unique.append(entity)
+                 seen_spans.add(span)
+
+         return unique
+
+     def sort_by_position(self, entities: list[L1Entity]) -> list[L1Entity]:
+         """Sort entities by start position"""
+         return sorted(entities, key=lambda x: x.start)
+
+     def add_noun_chunks(
+         self,
+         text: str,
+         entities: Optional[list[L1Entity]] = None
+     ) -> list[L1Entity]:
+         """Add noun chunks that do not overlap existing entities"""
+         if entities is None:
+             entities = []
+
+         doc = self.nlp(text)
+         seen_spans = {(e.start, e.end) for e in entities}
+
+         for chunk in doc.noun_chunks:
+             span = (chunk.start_char, chunk.end_char)
+
+             # Skip chunks whose span overlaps any already-seen span.
+             overlap = False
+             for (s, e) in seen_spans:
+                 if not (chunk.end_char <= s or chunk.start_char >= e):
+                     overlap = True
+                     break
+
+             if not overlap and len(chunk.text) >= self.config.min_entity_length:
+                 left_context, right_context = self._get_context(
+                     text, chunk.start_char, chunk.end_char
+                 )
+
+                 entities.append(L1Entity(
+                     text=chunk.text,
+                     start=chunk.start_char,
+                     end=chunk.end_char,
+                     left_context=left_context,
+                     right_context=right_context
+                 ))
+                 seen_spans.add(span)
+
+         return entities
+
+     def _get_context(self, text: str, start: int, end: int) -> tuple[str, str]:
+         """Extract left and right context for entity"""
+         left_start = max(0, start - self.config.max_left_context)
+         left_context = text[left_start:start].strip()
+
+         right_end = min(len(text), end + self.config.max_right_context)
+         right_context = text[end:right_end].strip()
+
+         return left_context, right_context
+
+
+ class L1GlinerComponent(BaseComponent[L1GlinerConfig]):
+     """GLiNER-based entity extraction component for L1"""
+
+     def _setup(self):
+         """Initialize GLiNER model"""
+         from gliner import GLiNER
+
+         self.model = GLiNER.from_pretrained(
+             self.config.model,
+             token=self.config.token,
+             max_length=self.config.max_length
+         )
+         self.model.to(self.config.device)
+
+         # Fix labels tokenizer max_length for BiEncoder models
+         if (self.config.max_length is not None and
+                 hasattr(self.model, 'data_processor') and
+                 hasattr(self.model.data_processor, 'labels_tokenizer')):
+             tok = self.model.data_processor.labels_tokenizer
+             if tok.model_max_length > 100000:
+                 tok.model_max_length = self.config.max_length
+
+         # Precompute label embeddings if requested and model supports it
+         self._label_embeddings = None
+         if self.config.use_precomputed_embeddings and self.supports_precomputed_embeddings:
+             self._label_embeddings = self.encode_labels(self.config.labels)
+
+     @property
+     def supports_precomputed_embeddings(self) -> bool:
+         """Check if model supports precomputed embeddings (BiEncoder)"""
+         return hasattr(self.model, 'encode_labels') and self.model.config.labels_encoder is not None
+
+     def get_available_methods(self) -> list[str]:
+         """Return list of available pipeline methods"""
+         return [
+             "extract_entities",
+             "filter_by_length",
+             "deduplicate",
+             "sort_by_position",
+             "encode_labels"
+         ]
+
+     def encode_labels(self, labels: list[str], batch_size: Optional[int] = None) -> torch.Tensor:
+         """
+         Encode labels using GLiNER's native label encoder.
+
+         Args:
+             labels: List of label strings to encode
+             batch_size: Batch size for encoding (defaults to config.batch_size)
+
+         Returns:
+             Tensor of shape (num_labels, hidden_size)
+
+         Raises:
+             NotImplementedError: If model doesn't support label encoding
+         """
+         if not self.supports_precomputed_embeddings:
+             raise NotImplementedError(
+                 f"Model {self.config.model} doesn't support label precomputation. "
+                 "Only BiEncoder models support this feature."
+             )
+
+         batch_size = batch_size or self.config.batch_size
+         return self.model.encode_labels(labels, batch_size=batch_size)
+
+     def extract_entities(self, text: str) -> list[L1Entity]:
+         """Extract named entities from text using GLiNER"""
+         if not self.config.labels:
+             return []
+
+         # Use precomputed embeddings if available
+         if self._label_embeddings is not None:
+             raw_entities = self.model.predict_with_embeds(
+                 text,
+                 self._label_embeddings,
+                 self.config.labels,
+                 threshold=self.config.threshold,
+                 flat_ner=self.config.flat_ner,
+                 multi_label=self.config.multi_label
+             )
+         else:
+             raw_entities = self.model.predict_entities(
+                 text,
+                 self.config.labels,
+                 threshold=self.config.threshold,
+                 flat_ner=self.config.flat_ner,
+                 multi_label=self.config.multi_label
+             )
+
+         entities = []
+         seen_spans = set()
+
+         for ent in raw_entities:
+             span = (ent["start"], ent["end"])
+             if span in seen_spans:
+                 continue
+
+             left_context, right_context = self._get_context(
+                 text, ent["start"], ent["end"]
+             )
+
+             entities.append(L1Entity(
+                 text=ent["text"],
+                 start=ent["start"],
+                 end=ent["end"],
+                 left_context=left_context,
+                 right_context=right_context
+             ))
+             seen_spans.add(span)
+
+         return entities
+
+     def filter_by_length(
+         self,
+         entities: list[L1Entity],
+         min_length: Optional[int] = None
+     ) -> list[L1Entity]:
+         """Filter entities by minimum text length"""
+         min_len = min_length if min_length is not None else self.config.min_entity_length
+         return [e for e in entities if len(e.text) >= min_len]
+
+     def deduplicate(self, entities: list[L1Entity]) -> list[L1Entity]:
+         """Remove duplicate entities by span"""
+         seen_spans = set()
+         unique = []
+
+         for entity in entities:
+             span = (entity.start, entity.end)
+             if span not in seen_spans:
+                 unique.append(entity)
+                 seen_spans.add(span)
+
+         return unique
+
+     def sort_by_position(self, entities: list[L1Entity]) -> list[L1Entity]:
+         """Sort entities by start position"""
+         return sorted(entities, key=lambda x: x.start)
+
+     def _get_context(self, text: str, start: int, end: int) -> tuple[str, str]:
+         """Extract left and right context for entity"""
+         left_start = max(0, start - self.config.max_left_context)
+         left_context = text[left_start:start].strip()
+
+         right_end = min(len(text), end + self.config.max_right_context)
+         right_context = text[end:right_end].strip()
+
+         return left_context, right_context
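
For orientation, here is a minimal usage sketch of the spaCy component above. It assumes that `BaseComponent(config)` stores the config and invokes the `_setup()` hook on construction (the base class is not shown in this diff) and that the named spaCy model is installed or downloadable; treat the model name and the construction call as illustrative assumptions, not documented API.

    from glinker.l1.component import L1SpacyComponent
    from glinker.l1.models import L1Config

    # Assumption: BaseComponent.__init__ stores config and calls _setup().
    component = L1SpacyComponent(L1Config(model="en_core_web_sm", device="cpu"))

    text = "Aspirin inhibits cyclooxygenase in human platelets."
    entities = component.extract_entities(text)
    entities = component.add_noun_chunks(text, entities)  # optional enrichment
    entities = component.sort_by_position(component.deduplicate(entities))
    for e in entities:
        print(e.text, e.start, e.end, "|", e.left_context, "...", e.right_context)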
glinker/l1/models.py ADDED
@@ -0,0 +1,47 @@
+ from pydantic import Field
+ from typing import Optional
+ from glinker.core.base import BaseConfig, BaseInput, BaseOutput
+
+
+ class L1Config(BaseConfig):
+     model: str = Field("en_core_sci_sm", description="spaCy model identifier")
+     device: str = Field("cpu", description="Device to run the model on")
+     batch_size: int = Field(16, description="Batch size for processing")
+     max_right_context: int = Field(50, description="Maximum right context length")
+     max_left_context: int = Field(50, description="Maximum left context length")
+     min_entity_length: int = Field(2, description="Minimum entity text length")
+     include_noun_chunks: bool = Field(False, description="Include noun chunks")
+
+
+ class L1GlinerConfig(L1Config):
+     """Configuration for GLiNER-based L1 entity extraction"""
+     model: str = Field(..., description="GLiNER model identifier (overrides spaCy model)")
+     labels: list[str] = Field(..., description="Fixed list of labels for entity extraction")
+     token: Optional[str] = Field(None, description="HuggingFace token")
+     threshold: float = Field(0.3, description="Confidence threshold for entity extraction")
+     flat_ner: bool = Field(True, description="Use flat NER (no nested entities)")
+     multi_label: bool = Field(False, description="Allow multiple labels per entity")
+     use_precomputed_embeddings: bool = Field(
+         False,
+         description="Use precomputed label embeddings (BiEncoder only)"
+     )
+     max_length: Optional[int] = Field(
+         None,
+         description="Maximum sequence length for tokenization"
+     )
+
+
+ class L1Input(BaseInput):
+     texts: list[str] = Field(..., description="List of text inputs")
+
+
+ class L1Entity(BaseOutput):
+     text: str = Field(..., description="Extracted mention text")
+     start: int = Field(..., description="Start position")
+     end: int = Field(..., description="End position")
+     left_context: str = Field(..., description="Left context")
+     right_context: str = Field(..., description="Right context")
+
+
+ class L1Output(BaseOutput):
+     entities: list[list[L1Entity]] = Field(..., description="Extracted entities per text")
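
To illustrate the config models, the classes above are ordinary pydantic models and can be constructed directly; the model identifier and labels below are example values, not package defaults:

    from glinker.l1.models import L1GlinerConfig

    # Example values only; "urchade/gliner_small-v2.1" is a public GLiNER
    # checkpoint used here purely for illustration.
    config = L1GlinerConfig(
        model="urchade/gliner_small-v2.1",
        labels=["person", "organization", "location"],
        threshold=0.3,
        use_precomputed_embeddings=False,
    )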
glinker/l1/processor.py ADDED
@@ -0,0 +1,152 @@
+ from typing import Any, Optional
+ from glinker.core.base import BaseProcessor
+ from glinker.core.registry import processor_registry
+ from .models import L1Config, L1GlinerConfig, L1Input, L1Output, L1Entity
+ from .component import L1SpacyComponent, L1GlinerComponent
+
+
+ class L1SpacyProcessor(BaseProcessor[L1Config, L1Input, L1Output]):
+     """Optimized batch processor using spaCy pipe"""
+
+     def __init__(
+         self,
+         config: L1Config,
+         component: L1SpacyComponent,
+         pipeline: Optional[list[tuple[str, dict[str, Any]]]] = None
+     ):
+         super().__init__(config, component, pipeline)
+         self._validate_pipeline()
+
+     def _default_pipeline(self) -> list[tuple[str, dict[str, Any]]]:
+         return [
+             ("extract_entities", {}),
+             ("deduplicate", {}),
+             ("sort_by_position", {})
+         ]
+
+     def __call__(
+         self,
+         texts: Optional[list[str]] = None,
+         input_data: Optional[L1Input] = None
+     ) -> L1Output:
+         """Process batch using spaCy's efficient pipe"""
+
+         # Support both direct texts and L1Input
+         if texts is not None:
+             texts_to_process = texts
+         elif input_data is not None:
+             texts_to_process = input_data.texts
+         else:
+             raise ValueError("Either 'texts' or 'input_data' must be provided")
+
+         # Extraction happens via nlp.pipe, so drop extract_entities from the
+         # per-text pipeline once, outside the loop.
+         pipeline_rest = [
+             (method, kwargs)
+             for method, kwargs in self.pipeline
+             if method != "extract_entities"
+         ]
+
+         results = []
+
+         for doc, original_text in zip(
+             self.component.nlp.pipe(
+                 texts_to_process,
+                 batch_size=self.config.batch_size
+             ),
+             texts_to_process
+         ):
+             entities = self._extract_from_doc(doc, original_text)
+             entities = self._execute_pipeline(entities, pipeline_rest)
+             results.append(entities)
+
+         return L1Output(entities=results)
+
+     def _extract_from_doc(self, doc, text: str) -> list[L1Entity]:
+         """Extract entities from already processed doc"""
+         entities = []
+         for ent in doc.ents:
+             left_context, right_context = self.component._get_context(
+                 text, ent.start_char, ent.end_char
+             )
+
+             entities.append(L1Entity(
+                 text=ent.text,
+                 start=ent.start_char,
+                 end=ent.end_char,
+                 left_context=left_context,
+                 right_context=right_context
+             ))
+
+         return entities
+
+
+ @processor_registry.register("l1_spacy")
+ def create_l1_spacy_processor(config_dict: dict, pipeline: Optional[list] = None) -> L1SpacyProcessor:
+     """Factory: creates component + batch processor"""
+     config = L1Config(**config_dict)
+     component = L1SpacyComponent(config)
+     return L1SpacyProcessor(config, component, pipeline)
+
+
+ class L1GlinerProcessor(BaseProcessor[L1GlinerConfig, L1Input, L1Output]):
+     """GLiNER-based batch processor for L1 entity extraction"""
+
+     def __init__(
+         self,
+         config: L1GlinerConfig,
+         component: L1GlinerComponent,
+         pipeline: Optional[list[tuple[str, dict[str, Any]]]] = None
+     ):
+         super().__init__(config, component, pipeline)
+         self._validate_pipeline()
+
+     def _default_pipeline(self) -> list[tuple[str, dict[str, Any]]]:
+         return [
+             ("extract_entities", {}),
+             ("deduplicate", {}),
+             ("sort_by_position", {})
+         ]
+
+     def __call__(
+         self,
+         texts: Optional[list[str]] = None,
+         input_data: Optional[L1Input] = None
+     ) -> L1Output:
+         """Process batch of texts using GLiNER"""
+
+         # Support both direct texts and L1Input
+         if texts is not None:
+             texts_to_process = texts
+         elif input_data is not None:
+             texts_to_process = input_data.texts
+         else:
+             raise ValueError("Either 'texts' or 'input_data' must be provided")
+
+         # Drop extract_entities from the pipeline once, outside the loop,
+         # since extraction is performed directly below.
+         pipeline_rest = [
+             (method, kwargs)
+             for method, kwargs in self.pipeline
+             if method != "extract_entities"
+         ]
+
+         results = []
+
+         # Process each text individually
+         for text in texts_to_process:
+             entities = self.component.extract_entities(text)
+             entities = self._execute_pipeline(entities, pipeline_rest)
+             results.append(entities)
+
+         return L1Output(entities=results)
+
+
+ @processor_registry.register("l1_gliner")
+ def create_l1_gliner_processor(config_dict: dict, pipeline: Optional[list] = None) -> L1GlinerProcessor:
+     """Factory: creates component + GLiNER processor"""
+     config = L1GlinerConfig(**config_dict)
+     component = L1GlinerComponent(config)
+     return L1GlinerProcessor(config, component, pipeline)
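
A sketch of how the registered factories are presumably consumed. The lookup method on `processor_registry` is not shown in this diff, so the `get("l1_spacy")` call below is an assumption about its interface; the factory signature itself matches the code above.

    from glinker.core.registry import processor_registry

    # Assumption: the registry exposes registered factories by name via get().
    factory = processor_registry.get("l1_spacy")
    processor = factory({"model": "en_core_web_sm", "device": "cpu"})

    output = processor(texts=["Barack Obama visited Paris in 2009."])
    for ents in output.entities:
        print([(e.text, e.start, e.end) for e in ents])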
glinker/l2/__init__.py ADDED
@@ -0,0 +1,19 @@
+ from .models import L2Config, L2Input, L2Output, LayerConfig, FuzzyConfig, DatabaseRecord
+ from .component import DatabaseChainComponent, DatabaseLayer, DictLayer, RedisLayer, ElasticsearchLayer, PostgresLayer
+ from .processor import L2Processor
+
+ __all__ = [
+     "L2Config",
+     "L2Input",
+     "L2Output",
+     "LayerConfig",
+     "FuzzyConfig",
+     "DatabaseRecord",
+     "DatabaseChainComponent",
+     "DatabaseLayer",
+     "DictLayer",
+     "RedisLayer",
+     "ElasticsearchLayer",
+     "PostgresLayer",
+     "L2Processor"
+ ]