glinker-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
glinker/l1/component.py ADDED
@@ -0,0 +1,284 @@
+ import spacy
+ from spacy.language import Language
+ from typing import Optional
+ import torch
+ from glinker.core.base import BaseComponent
+ from .models import L1Config, L1GlinerConfig, L1Entity
+
+
+ class L1SpacyComponent(BaseComponent[L1Config]):
+     """spaCy-based entity extraction component."""
+
+     def _setup(self):
+         """Initialize the spaCy model."""
+         self.nlp = self._load_model()
+
+     def _load_model(self) -> Language:
+         """Load the spaCy model, downloading it first if it is not installed."""
+         if self.config.device != "cpu":
+             # require_gpu() must be called before spacy.load() so the
+             # pipeline is allocated on the GPU from the start; calling it
+             # after loading does not move an already-loaded model.
+             spacy.require_gpu()
+         try:
+             return spacy.load(self.config.model)
+         except OSError:
+             from spacy.cli import download
+             download(self.config.model)
+             return spacy.load(self.config.model)
+
+     def get_available_methods(self) -> list[str]:
+         """Return list of available pipeline methods"""
+         return [
+             "extract_entities",
+             "filter_by_length",
+             "deduplicate",
+             "sort_by_position",
+             "add_noun_chunks"
+         ]
+
+     def extract_entities(self, text: str) -> list[L1Entity]:
+         """Extract named entities from text"""
+         doc = self.nlp(text)
+         entities = []
+         seen_spans = set()
+
+         for ent in doc.ents:
+             span = (ent.start_char, ent.end_char)
+             if span in seen_spans:
+                 continue
+
+             left_context, right_context = self._get_context(
+                 text, ent.start_char, ent.end_char
+             )
+
+             entities.append(L1Entity(
+                 text=ent.text,
+                 start=ent.start_char,
+                 end=ent.end_char,
+                 left_context=left_context,
+                 right_context=right_context
+             ))
+             seen_spans.add(span)
+
+         return entities
+
+     def filter_by_length(
+         self,
+         entities: list[L1Entity],
+         min_length: Optional[int] = None
+     ) -> list[L1Entity]:
+         """Filter entities by minimum text length"""
+         min_len = min_length if min_length is not None else self.config.min_entity_length
+         return [e for e in entities if len(e.text) >= min_len]
+
+     def deduplicate(self, entities: list[L1Entity]) -> list[L1Entity]:
+         """Remove duplicate entities by span"""
+         seen_spans = set()
+         unique = []
+
+         for entity in entities:
+             span = (entity.start, entity.end)
+             if span not in seen_spans:
+                 unique.append(entity)
+                 seen_spans.add(span)
+
+         return unique
+
+     def sort_by_position(self, entities: list[L1Entity]) -> list[L1Entity]:
+         """Sort entities by start position"""
+         return sorted(entities, key=lambda x: x.start)
+
+     def add_noun_chunks(
+         self,
+         text: str,
+         entities: Optional[list[L1Entity]] = None
+     ) -> list[L1Entity]:
+         """Add noun chunks that do not overlap existing entities"""
+         if entities is None:
+             entities = []
+
+         doc = self.nlp(text)
+         seen_spans = {(e.start, e.end) for e in entities}
+
+         for chunk in doc.noun_chunks:
+             span = (chunk.start_char, chunk.end_char)
+
+             # Skip chunks whose span overlaps any already-seen span.
+             overlap = False
+             for (s, e) in seen_spans:
+                 if not (chunk.end_char <= s or chunk.start_char >= e):
+                     overlap = True
+                     break
+
+             if not overlap and len(chunk.text) >= self.config.min_entity_length:
+                 left_context, right_context = self._get_context(
+                     text, chunk.start_char, chunk.end_char
+                 )
+
+                 entities.append(L1Entity(
+                     text=chunk.text,
+                     start=chunk.start_char,
+                     end=chunk.end_char,
+                     left_context=left_context,
+                     right_context=right_context
+                 ))
+                 seen_spans.add(span)
+
+         return entities
+
+     def _get_context(self, text: str, start: int, end: int) -> tuple[str, str]:
+         """Extract left and right context for entity"""
+         left_start = max(0, start - self.config.max_left_context)
+         left_context = text[left_start:start].strip()
+
+         right_end = min(len(text), end + self.config.max_right_context)
+         right_context = text[end:right_end].strip()
+
+         return left_context, right_context
+
+
+ class L1GlinerComponent(BaseComponent[L1GlinerConfig]):
+     """GLiNER-based entity extraction component for L1"""
+
+     def _setup(self):
+         """Initialize GLiNER model"""
+         from gliner import GLiNER
+
+         self.model = GLiNER.from_pretrained(
+             self.config.model,
+             token=self.config.token,
+             max_length=self.config.max_length
+         )
+         self.model.to(self.config.device)
+
+         # Fix labels tokenizer max_length for BiEncoder models
+         if (self.config.max_length is not None and
+                 hasattr(self.model, 'data_processor') and
+                 hasattr(self.model.data_processor, 'labels_tokenizer')):
+             tok = self.model.data_processor.labels_tokenizer
+             if tok.model_max_length > 100000:
+                 tok.model_max_length = self.config.max_length
+
+         # Precompute label embeddings if requested and model supports it
+         self._label_embeddings = None
+         if self.config.use_precomputed_embeddings and self.supports_precomputed_embeddings:
+             self._label_embeddings = self.encode_labels(self.config.labels)
+
+     @property
+     def supports_precomputed_embeddings(self) -> bool:
+         """Check if model supports precomputed embeddings (BiEncoder)"""
+         return hasattr(self.model, 'encode_labels') and self.model.config.labels_encoder is not None
+
+     def get_available_methods(self) -> list[str]:
+         """Return list of available pipeline methods"""
+         return [
+             "extract_entities",
+             "filter_by_length",
+             "deduplicate",
+             "sort_by_position",
+             "encode_labels"
+         ]
+
+     def encode_labels(self, labels: list[str], batch_size: Optional[int] = None) -> torch.Tensor:
+         """
+         Encode labels using GLiNER's native label encoder.
+
+         Args:
+             labels: List of label strings to encode
+             batch_size: Batch size for encoding (defaults to config.batch_size)
+
+         Returns:
+             Tensor of shape (num_labels, hidden_size)
+
+         Raises:
+             NotImplementedError: If model doesn't support label encoding
+         """
+         if not self.supports_precomputed_embeddings:
+             raise NotImplementedError(
+                 f"Model {self.config.model} doesn't support label precomputation. "
+                 "Only BiEncoder models support this feature."
+             )
+
+         batch_size = batch_size or self.config.batch_size
+         return self.model.encode_labels(labels, batch_size=batch_size)
+
+     def extract_entities(self, text: str) -> list[L1Entity]:
+         """Extract named entities from text using GLiNER"""
+         if not self.config.labels:
+             return []
+
+         # Use precomputed embeddings if available
+         if self._label_embeddings is not None:
+             raw_entities = self.model.predict_with_embeds(
+                 text,
+                 self._label_embeddings,
+                 self.config.labels,
+                 threshold=self.config.threshold,
+                 flat_ner=self.config.flat_ner,
+                 multi_label=self.config.multi_label
+             )
+         else:
+             raw_entities = self.model.predict_entities(
+                 text,
+                 self.config.labels,
+                 threshold=self.config.threshold,
+                 flat_ner=self.config.flat_ner,
+                 multi_label=self.config.multi_label
+             )
+
+         entities = []
+         seen_spans = set()
+
+         for ent in raw_entities:
+             span = (ent["start"], ent["end"])
+             if span in seen_spans:
+                 continue
+
+             left_context, right_context = self._get_context(
+                 text, ent["start"], ent["end"]
+             )
+
+             entities.append(L1Entity(
+                 text=ent["text"],
+                 start=ent["start"],
+                 end=ent["end"],
+                 left_context=left_context,
+                 right_context=right_context
+             ))
+             seen_spans.add(span)
+
+         return entities
+
+     def filter_by_length(
+         self,
+         entities: list[L1Entity],
+         min_length: Optional[int] = None
+     ) -> list[L1Entity]:
+         """Filter entities by minimum text length"""
+         min_len = min_length if min_length is not None else self.config.min_entity_length
+         return [e for e in entities if len(e.text) >= min_len]
+
+     def deduplicate(self, entities: list[L1Entity]) -> list[L1Entity]:
+         """Remove duplicate entities by span"""
+         seen_spans = set()
+         unique = []
+
+         for entity in entities:
+             span = (entity.start, entity.end)
+             if span not in seen_spans:
+                 unique.append(entity)
+                 seen_spans.add(span)
+
+         return unique
+
+     def sort_by_position(self, entities: list[L1Entity]) -> list[L1Entity]:
+         """Sort entities by start position"""
+         return sorted(entities, key=lambda x: x.start)
+
+     def _get_context(self, text: str, start: int, end: int) -> tuple[str, str]:
+         """Extract left and right context for entity"""
+         left_start = max(0, start - self.config.max_left_context)
+         left_context = text[left_start:start].strip()
+
+         right_end = min(len(text), end + self.config.max_right_context)
+         right_context = text[end:right_end].strip()
+
+         return left_context, right_context
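
For orientation, here is a minimal usage sketch of the spaCy component above. It assumes that `BaseComponent(config)` stores the config and invokes the `_setup()` hook on construction (the base class is not shown in this diff) and that the named spaCy model is installed or downloadable; treat the model name and the construction call as illustrative assumptions, not documented API.

    from glinker.l1.component import L1SpacyComponent
    from glinker.l1.models import L1Config

    # Assumption: BaseComponent.__init__ stores config and calls _setup().
    component = L1SpacyComponent(L1Config(model="en_core_web_sm", device="cpu"))

    text = "Aspirin inhibits cyclooxygenase in human platelets."
    entities = component.extract_entities(text)
    entities = component.add_noun_chunks(text, entities)  # optional enrichment
    entities = component.sort_by_position(component.deduplicate(entities))
    for e in entities:
        print(e.text, e.start, e.end, "|", e.left_context, "...", e.right_context)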
glinker/l1/models.py ADDED
@@ -0,0 +1,47 @@
+ from pydantic import Field
+ from typing import Optional
+ from glinker.core.base import BaseConfig, BaseInput, BaseOutput
+
+
+ class L1Config(BaseConfig):
+     model: str = Field("en_core_sci_sm", description="spaCy model identifier")
+     device: str = Field("cpu", description="Device to run the model on")
+     batch_size: int = Field(16, description="Batch size for processing")
+     max_right_context: int = Field(50, description="Maximum right context length")
+     max_left_context: int = Field(50, description="Maximum left context length")
+     min_entity_length: int = Field(2, description="Minimum entity text length")
+     include_noun_chunks: bool = Field(False, description="Include noun chunks")
+
+
+ class L1GlinerConfig(L1Config):
+     """Configuration for GLiNER-based L1 entity extraction"""
+     model: str = Field(..., description="GLiNER model identifier (overrides spaCy model)")
+     labels: list[str] = Field(..., description="Fixed list of labels for entity extraction")
+     token: Optional[str] = Field(None, description="HuggingFace token")
+     threshold: float = Field(0.3, description="Confidence threshold for entity extraction")
+     flat_ner: bool = Field(True, description="Use flat NER (no nested entities)")
+     multi_label: bool = Field(False, description="Allow multiple labels per entity")
+     use_precomputed_embeddings: bool = Field(
+         False,
+         description="Use precomputed label embeddings (BiEncoder only)"
+     )
+     max_length: Optional[int] = Field(
+         None,
+         description="Maximum sequence length for tokenization"
+     )
+
+
+ class L1Input(BaseInput):
+     texts: list[str] = Field(..., description="List of text inputs")
+
+
+ class L1Entity(BaseOutput):
+     text: str = Field(..., description="Extracted mention text")
+     start: int = Field(..., description="Start position")
+     end: int = Field(..., description="End position")
+     left_context: str = Field(..., description="Left context")
+     right_context: str = Field(..., description="Right context")
+
+
+ class L1Output(BaseOutput):
+     entities: list[list[L1Entity]] = Field(..., description="Extracted entities per text")
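
To illustrate the config models, the classes above are ordinary pydantic models and can be constructed directly; the model identifier and labels below are example values, not package defaults:

    from glinker.l1.models import L1GlinerConfig

    # Example values only; "urchade/gliner_small-v2.1" is a public GLiNER
    # checkpoint used here purely for illustration.
    config = L1GlinerConfig(
        model="urchade/gliner_small-v2.1",
        labels=["person", "organization", "location"],
        threshold=0.3,
        use_precomputed_embeddings=False,
    )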
glinker/l1/processor.py ADDED
@@ -0,0 +1,152 @@
+ from typing import Any, Optional
+ from glinker.core.base import BaseProcessor
+ from glinker.core.registry import processor_registry
+ from .models import L1Config, L1GlinerConfig, L1Input, L1Output, L1Entity
+ from .component import L1SpacyComponent, L1GlinerComponent
+
+
+ class L1SpacyProcessor(BaseProcessor[L1Config, L1Input, L1Output]):
+     """Optimized batch processor using spaCy pipe"""
+
+     def __init__(
+         self,
+         config: L1Config,
+         component: L1SpacyComponent,
+         pipeline: Optional[list[tuple[str, dict[str, Any]]]] = None
+     ):
+         super().__init__(config, component, pipeline)
+         self._validate_pipeline()
+
+     def _default_pipeline(self) -> list[tuple[str, dict[str, Any]]]:
+         return [
+             ("extract_entities", {}),
+             ("deduplicate", {}),
+             ("sort_by_position", {})
+         ]
+
+     def __call__(
+         self,
+         texts: Optional[list[str]] = None,
+         input_data: Optional[L1Input] = None
+     ) -> L1Output:
+         """Process batch using spaCy's efficient pipe"""
+
+         # Support both direct texts and L1Input
+         if texts is not None:
+             texts_to_process = texts
+         elif input_data is not None:
+             texts_to_process = input_data.texts
+         else:
+             raise ValueError("Either 'texts' or 'input_data' must be provided")
+
+         # Extraction happens via nlp.pipe, so drop extract_entities from the
+         # per-text pipeline once, outside the loop.
+         pipeline_rest = [
+             (method, kwargs)
+             for method, kwargs in self.pipeline
+             if method != "extract_entities"
+         ]
+
+         results = []
+
+         for doc, original_text in zip(
+             self.component.nlp.pipe(
+                 texts_to_process,
+                 batch_size=self.config.batch_size
+             ),
+             texts_to_process
+         ):
+             entities = self._extract_from_doc(doc, original_text)
+             entities = self._execute_pipeline(entities, pipeline_rest)
+             results.append(entities)
+
+         return L1Output(entities=results)
+
+     def _extract_from_doc(self, doc, text: str) -> list[L1Entity]:
+         """Extract entities from already processed doc"""
+         entities = []
+         for ent in doc.ents:
+             left_context, right_context = self.component._get_context(
+                 text, ent.start_char, ent.end_char
+             )
+
+             entities.append(L1Entity(
+                 text=ent.text,
+                 start=ent.start_char,
+                 end=ent.end_char,
+                 left_context=left_context,
+                 right_context=right_context
+             ))
+
+         return entities
+
+
+ @processor_registry.register("l1_spacy")
+ def create_l1_spacy_processor(config_dict: dict, pipeline: Optional[list] = None) -> L1SpacyProcessor:
+     """Factory: creates component + batch processor"""
+     config = L1Config(**config_dict)
+     component = L1SpacyComponent(config)
+     return L1SpacyProcessor(config, component, pipeline)
+
+
+ class L1GlinerProcessor(BaseProcessor[L1GlinerConfig, L1Input, L1Output]):
+     """GLiNER-based batch processor for L1 entity extraction"""
+
+     def __init__(
+         self,
+         config: L1GlinerConfig,
+         component: L1GlinerComponent,
+         pipeline: Optional[list[tuple[str, dict[str, Any]]]] = None
+     ):
+         super().__init__(config, component, pipeline)
+         self._validate_pipeline()
+
+     def _default_pipeline(self) -> list[tuple[str, dict[str, Any]]]:
+         return [
+             ("extract_entities", {}),
+             ("deduplicate", {}),
+             ("sort_by_position", {})
+         ]
+
+     def __call__(
+         self,
+         texts: Optional[list[str]] = None,
+         input_data: Optional[L1Input] = None
+     ) -> L1Output:
+         """Process batch of texts using GLiNER"""
+
+         # Support both direct texts and L1Input
+         if texts is not None:
+             texts_to_process = texts
+         elif input_data is not None:
+             texts_to_process = input_data.texts
+         else:
+             raise ValueError("Either 'texts' or 'input_data' must be provided")
+
+         # Drop extract_entities from the pipeline once, outside the loop,
+         # since extraction is performed directly below.
+         pipeline_rest = [
+             (method, kwargs)
+             for method, kwargs in self.pipeline
+             if method != "extract_entities"
+         ]
+
+         results = []
+
+         # Process each text individually
+         for text in texts_to_process:
+             entities = self.component.extract_entities(text)
+             entities = self._execute_pipeline(entities, pipeline_rest)
+             results.append(entities)
+
+         return L1Output(entities=results)
+
+
+ @processor_registry.register("l1_gliner")
+ def create_l1_gliner_processor(config_dict: dict, pipeline: Optional[list] = None) -> L1GlinerProcessor:
+     """Factory: creates component + GLiNER processor"""
+     config = L1GlinerConfig(**config_dict)
+     component = L1GlinerComponent(config)
+     return L1GlinerProcessor(config, component, pipeline)
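
A sketch of how the registered factories are presumably consumed. The lookup method on `processor_registry` is not shown in this diff, so the `get("l1_spacy")` call below is an assumption about its interface; the factory signature itself matches the code above.

    from glinker.core.registry import processor_registry

    # Assumption: the registry exposes registered factories by name via get().
    factory = processor_registry.get("l1_spacy")
    processor = factory({"model": "en_core_web_sm", "device": "cpu"})

    output = processor(texts=["Barack Obama visited Paris in 2009."])
    for ents in output.entities:
        print([(e.text, e.start, e.end) for e in ents])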
glinker/l2/__init__.py ADDED
@@ -0,0 +1,19 @@
+ from .models import L2Config, L2Input, L2Output, LayerConfig, FuzzyConfig, DatabaseRecord
+ from .component import DatabaseChainComponent, DatabaseLayer, DictLayer, RedisLayer, ElasticsearchLayer, PostgresLayer
+ from .processor import L2Processor
+
+ __all__ = [
+     "L2Config",
+     "L2Input",
+     "L2Output",
+     "LayerConfig",
+     "FuzzyConfig",
+     "DatabaseRecord",
+     "DatabaseChainComponent",
+     "DatabaseLayer",
+     "DictLayer",
+     "RedisLayer",
+     "ElasticsearchLayer",
+     "PostgresLayer",
+     "L2Processor"
+ ]