glinker 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glinker/__init__.py +54 -0
- glinker/core/__init__.py +56 -0
- glinker/core/base.py +103 -0
- glinker/core/builders.py +547 -0
- glinker/core/dag.py +898 -0
- glinker/core/factory.py +261 -0
- glinker/core/registry.py +31 -0
- glinker/l0/__init__.py +21 -0
- glinker/l0/component.py +472 -0
- glinker/l0/models.py +90 -0
- glinker/l0/processor.py +108 -0
- glinker/l1/__init__.py +15 -0
- glinker/l1/component.py +284 -0
- glinker/l1/models.py +47 -0
- glinker/l1/processor.py +152 -0
- glinker/l2/__init__.py +19 -0
- glinker/l2/component.py +1220 -0
- glinker/l2/models.py +99 -0
- glinker/l2/processor.py +170 -0
- glinker/l3/__init__.py +12 -0
- glinker/l3/component.py +184 -0
- glinker/l3/models.py +48 -0
- glinker/l3/processor.py +350 -0
- glinker/l4/__init__.py +9 -0
- glinker/l4/component.py +121 -0
- glinker/l4/models.py +21 -0
- glinker/l4/processor.py +156 -0
- glinker/py.typed +1 -0
- glinker-0.1.0.dist-info/METADATA +994 -0
- glinker-0.1.0.dist-info/RECORD +33 -0
- glinker-0.1.0.dist-info/WHEEL +5 -0
- glinker-0.1.0.dist-info/licenses/LICENSE +201 -0
- glinker-0.1.0.dist-info/top_level.txt +1 -0
glinker/l0/component.py
ADDED
|
@@ -0,0 +1,472 @@
|
|
|
1
|
+
from typing import List, Optional, Dict, Tuple
|
|
2
|
+
from glinker.core.base import BaseComponent
|
|
3
|
+
from .models import (
|
|
4
|
+
L0Config, L0Entity, LinkedEntity
|
|
5
|
+
)
|
|
6
|
+
from glinker.l1.models import L1Entity
|
|
7
|
+
from glinker.l2.models import DatabaseRecord
|
|
8
|
+
from glinker.l3.models import L3Entity
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class L0Component(BaseComponent[L0Config]):
    """
    L0 aggregation component - combines outputs from L1, L2, L3.

    Workflow:
    1. For each L1 mention, collect its L2 candidates.
    2. For each L1 mention, check whether L3 linked it.
    3. Emit an L0Entity carrying the full information from all layers.
    """

    def get_available_methods(self) -> List[str]:
        """Names of the public methods usable as pipeline steps."""
        return ["aggregate", "filter_by_confidence", "sort_by_confidence", "calculate_stats"]
|
|
28
|
+
|
|
29
|
+
def aggregate(
|
|
30
|
+
self,
|
|
31
|
+
l1_entities: List[List[L1Entity]],
|
|
32
|
+
l2_candidates: List[List[DatabaseRecord]],
|
|
33
|
+
l3_entities: List[List[L3Entity]],
|
|
34
|
+
template: str = "{label}"
|
|
35
|
+
) -> List[List[L0Entity]]:
|
|
36
|
+
"""
|
|
37
|
+
Main aggregation method - combines all layers
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
l1_entities: [[L1Entity, ...], ...] - one list per text
|
|
41
|
+
l2_candidates: [[DatabaseRecord, ...], ...] - one list per text
|
|
42
|
+
l3_entities: [[L3Entity, ...], ...] - one list per text
|
|
43
|
+
template: Label template from L3 schema (e.g., "{label} {description}")
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
[[L0Entity, ...], ...] - aggregated entities per text
|
|
47
|
+
"""
|
|
48
|
+
all_results = []
|
|
49
|
+
|
|
50
|
+
# Process each text separately
|
|
51
|
+
for text_idx in range(len(l1_entities)):
|
|
52
|
+
l1_mentions = l1_entities[text_idx] if text_idx < len(l1_entities) else []
|
|
53
|
+
l2_cands = l2_candidates[text_idx] if text_idx < len(l2_candidates) else []
|
|
54
|
+
l3_links = l3_entities[text_idx] if text_idx < len(l3_entities) else []
|
|
55
|
+
|
|
56
|
+
text_results = self._aggregate_single_text(l1_mentions, l2_cands, l3_links, template)
|
|
57
|
+
all_results.append(text_results)
|
|
58
|
+
|
|
59
|
+
return all_results
|
|
60
|
+
|
|
61
|
+
    def _aggregate_single_text(
        self,
        l1_mentions: List[L1Entity],
        l2_candidates: List[DatabaseRecord],
        l3_links: List[L3Entity],
        template: str = "{label}"
    ) -> List[L0Entity]:
        """
        Aggregate L1/L2/L3 data for a single text.

        Strategy:
        1. Build an index of L3 linked entities by (start, end) position.
        2. For each L1 mention:
           - Find corresponding candidates from L2 (textual match)
           - Check if it was linked in L3 (positional match)
           - Create an L0Entity
        3. If strict_matching=False, also emit L3 entities that were not
           matched to any L1 mention (as pipeline_stage="l3_only").
        """
        # Build L3 index by position
        l3_by_position = self._build_l3_index(l3_links)

        results: List[L0Entity] = []
        used_l3_positions = set()  # Track which L3 entities were matched to L1 mentions

        for mention_idx, l1_mention in enumerate(l1_mentions):
            # Get candidates for this mention (L2 returns flat list, need to group)
            # Assuming L2 candidates are in the same order as L1 mentions
            mention_candidates = self._get_candidates_for_mention(
                mention_idx, l1_mention, l2_candidates
            )

            # Check if this mention was linked in L3
            linked_entity, l3_pos = self._find_linked_entity_with_position(
                l1_mention, l3_by_position, mention_candidates, template,
                tolerance=self.config.position_tolerance
            )

            # l3_pos is a non-empty tuple whenever a match was found, so
            # plain truthiness is safe here.
            if l3_pos:
                used_l3_positions.add(l3_pos)

            # Build candidate_scores from L3 class_probs
            candidate_scores = {}
            l3_entity = l3_by_position.get(l3_pos) if l3_pos else None
            if l3_entity and l3_entity.class_probs:
                candidate_scores = self._build_candidate_scores(
                    l3_entity.class_probs, l2_candidates, template
                )

            # Determine pipeline stage
            pipeline_stage = self._determine_stage(mention_candidates, linked_entity)

            # Create L0Entity
            l0_entity = L0Entity(
                mention_text=l1_mention.text,
                mention_start=l1_mention.start,
                mention_end=l1_mention.end,
                left_context=l1_mention.left_context,
                right_context=l1_mention.right_context,
                candidates=mention_candidates,
                num_candidates=len(mention_candidates),
                linked_entity=linked_entity,
                is_linked=linked_entity is not None,
                candidate_scores=candidate_scores,
                pipeline_stage=pipeline_stage
            )

            results.append(l0_entity)

        # If loose mode, include L3 entities that weren't matched to L1 mentions
        if not self.config.strict_matching:
            for (l3_start, l3_end), l3_entity in l3_by_position.items():
                if (l3_start, l3_end) not in used_l3_positions:
                    # This L3 entity was not matched to any L1 mention
                    # Find candidate by label
                    matched_candidate = self._match_candidate_by_label(
                        l3_entity.label, l2_candidates, template
                    )

                    # Build candidate_scores from class_probs
                    candidate_scores = {}
                    if l3_entity.class_probs:
                        candidate_scores = self._build_candidate_scores(
                            l3_entity.class_probs, l2_candidates, template
                        )

                    # Fall back to "unknown"/the L3 label when no L2
                    # candidate could be matched textually.
                    linked = LinkedEntity(
                        entity_id=matched_candidate.entity_id if matched_candidate else "unknown",
                        label=matched_candidate.label if matched_candidate else l3_entity.label,
                        confidence=l3_entity.score,
                        start=l3_entity.start,
                        end=l3_entity.end,
                        matched_text=l3_entity.text
                    )

                    l0_entity = L0Entity(
                        mention_text=l3_entity.text,
                        mention_start=l3_entity.start,
                        mention_end=l3_entity.end,
                        left_context="",  # No context from L1
                        right_context="",
                        candidates=[matched_candidate] if matched_candidate else [],
                        num_candidates=1 if matched_candidate else 0,
                        linked_entity=linked,
                        is_linked=True,
                        candidate_scores=candidate_scores,
                        pipeline_stage="l3_only"  # Indicates L3 found it without L1
                    )
                    results.append(l0_entity)

        return results
|
|
171
|
+
|
|
172
|
+
def _build_l3_index(self, l3_links: List[L3Entity]) -> Dict[Tuple[int, int], L3Entity]:
|
|
173
|
+
"""Build index of L3 entities by (start, end) position"""
|
|
174
|
+
index = {}
|
|
175
|
+
for entity in l3_links:
|
|
176
|
+
key = (entity.start, entity.end)
|
|
177
|
+
index[key] = entity
|
|
178
|
+
return index
|
|
179
|
+
|
|
180
|
+
def _get_candidates_for_mention(
|
|
181
|
+
self,
|
|
182
|
+
mention_idx: int,
|
|
183
|
+
l1_mention: L1Entity,
|
|
184
|
+
all_candidates: List[DatabaseRecord]
|
|
185
|
+
) -> List[DatabaseRecord]:
|
|
186
|
+
"""
|
|
187
|
+
Get candidates for specific mention
|
|
188
|
+
|
|
189
|
+
Note: L2 returns candidates grouped per text. We need to match by text content.
|
|
190
|
+
"""
|
|
191
|
+
matched_candidates = []
|
|
192
|
+
|
|
193
|
+
# Match candidates by mention text (normalize)
|
|
194
|
+
mention_text_lower = l1_mention.text.lower().strip()
|
|
195
|
+
|
|
196
|
+
for candidate in all_candidates:
|
|
197
|
+
# Check if candidate matches this mention
|
|
198
|
+
if candidate.label.lower().strip() == mention_text_lower:
|
|
199
|
+
matched_candidates.append(candidate)
|
|
200
|
+
continue
|
|
201
|
+
|
|
202
|
+
# Check aliases
|
|
203
|
+
for alias in candidate.aliases:
|
|
204
|
+
if alias.lower().strip() == mention_text_lower:
|
|
205
|
+
matched_candidates.append(candidate)
|
|
206
|
+
break
|
|
207
|
+
|
|
208
|
+
return matched_candidates
|
|
209
|
+
|
|
210
|
+
def _find_linked_entity(
|
|
211
|
+
self,
|
|
212
|
+
l1_mention: L1Entity,
|
|
213
|
+
l3_by_position: Dict[Tuple[int, int], L3Entity],
|
|
214
|
+
candidates: List[DatabaseRecord],
|
|
215
|
+
template: str = "{label}"
|
|
216
|
+
) -> Optional[LinkedEntity]:
|
|
217
|
+
"""
|
|
218
|
+
Find if this L1 mention was linked in L3
|
|
219
|
+
|
|
220
|
+
Strategy:
|
|
221
|
+
1. Look up L3 entity by position (start, end)
|
|
222
|
+
2. If found, match with candidates to get entity_id
|
|
223
|
+
3. Return LinkedEntity with full information
|
|
224
|
+
"""
|
|
225
|
+
linked, _ = self._find_linked_entity_with_position(
|
|
226
|
+
l1_mention, l3_by_position, candidates, template
|
|
227
|
+
)
|
|
228
|
+
return linked
|
|
229
|
+
|
|
230
|
+
def _find_linked_entity_with_position(
|
|
231
|
+
self,
|
|
232
|
+
l1_mention: L1Entity,
|
|
233
|
+
l3_by_position: Dict[Tuple[int, int], L3Entity],
|
|
234
|
+
candidates: List[DatabaseRecord],
|
|
235
|
+
template: str = "{label}",
|
|
236
|
+
tolerance: int = 2
|
|
237
|
+
) -> Tuple[Optional[LinkedEntity], Optional[Tuple[int, int]]]:
|
|
238
|
+
"""
|
|
239
|
+
Find if this L1 mention was linked in L3, and return the matched position
|
|
240
|
+
|
|
241
|
+
Returns:
|
|
242
|
+
Tuple of (LinkedEntity or None, matched position tuple or None)
|
|
243
|
+
"""
|
|
244
|
+
# Try exact position match
|
|
245
|
+
key = (l1_mention.start, l1_mention.end)
|
|
246
|
+
l3_entity = l3_by_position.get(key)
|
|
247
|
+
matched_key = key if l3_entity else None
|
|
248
|
+
|
|
249
|
+
if not l3_entity:
|
|
250
|
+
# Try fuzzy position match (text might be slightly different)
|
|
251
|
+
l3_entity, matched_key = self._fuzzy_position_match_with_key(
|
|
252
|
+
l1_mention.start, l1_mention.end, l3_by_position, tolerance
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
if not l3_entity:
|
|
256
|
+
return None, None
|
|
257
|
+
|
|
258
|
+
# Find matching candidate by label using template
|
|
259
|
+
matched_candidate = self._match_candidate_by_label(l3_entity.label, candidates, template)
|
|
260
|
+
|
|
261
|
+
if not matched_candidate:
|
|
262
|
+
# L3 found entity but no matching candidate - shouldn't happen but handle gracefully
|
|
263
|
+
return LinkedEntity(
|
|
264
|
+
entity_id="unknown",
|
|
265
|
+
label=l3_entity.label,
|
|
266
|
+
confidence=l3_entity.score,
|
|
267
|
+
start=l3_entity.start,
|
|
268
|
+
end=l3_entity.end,
|
|
269
|
+
matched_text=l3_entity.text
|
|
270
|
+
), matched_key
|
|
271
|
+
|
|
272
|
+
return LinkedEntity(
|
|
273
|
+
entity_id=matched_candidate.entity_id,
|
|
274
|
+
label=matched_candidate.label,
|
|
275
|
+
confidence=l3_entity.score,
|
|
276
|
+
start=l3_entity.start,
|
|
277
|
+
end=l3_entity.end,
|
|
278
|
+
matched_text=l3_entity.text
|
|
279
|
+
), matched_key
|
|
280
|
+
|
|
281
|
+
def _fuzzy_position_match(
|
|
282
|
+
self,
|
|
283
|
+
start: int,
|
|
284
|
+
end: int,
|
|
285
|
+
l3_by_position: Dict[Tuple[int, int], L3Entity],
|
|
286
|
+
tolerance: int = 2
|
|
287
|
+
) -> Optional[L3Entity]:
|
|
288
|
+
"""Find L3 entity with position close to given range"""
|
|
289
|
+
entity, _ = self._fuzzy_position_match_with_key(start, end, l3_by_position, tolerance)
|
|
290
|
+
return entity
|
|
291
|
+
|
|
292
|
+
def _fuzzy_position_match_with_key(
|
|
293
|
+
self,
|
|
294
|
+
start: int,
|
|
295
|
+
end: int,
|
|
296
|
+
l3_by_position: Dict[Tuple[int, int], L3Entity],
|
|
297
|
+
tolerance: int = 2
|
|
298
|
+
) -> Tuple[Optional[L3Entity], Optional[Tuple[int, int]]]:
|
|
299
|
+
"""Find L3 entity with position close to given range, return with its key"""
|
|
300
|
+
for (l3_start, l3_end), entity in l3_by_position.items():
|
|
301
|
+
if abs(l3_start - start) <= tolerance and abs(l3_end - end) <= tolerance:
|
|
302
|
+
return entity, (l3_start, l3_end)
|
|
303
|
+
return None, None
|
|
304
|
+
|
|
305
|
+
def _build_candidate_scores(
|
|
306
|
+
self,
|
|
307
|
+
class_probs: Dict[str, float],
|
|
308
|
+
candidates: List[DatabaseRecord],
|
|
309
|
+
template: str = "{label}"
|
|
310
|
+
) -> Dict[str, float]:
|
|
311
|
+
"""
|
|
312
|
+
Map L3 class_probs (label -> probability) to candidate entity_ids.
|
|
313
|
+
|
|
314
|
+
Args:
|
|
315
|
+
class_probs: Dict of label -> probability from L3 entity
|
|
316
|
+
candidates: L2 candidate records
|
|
317
|
+
template: Schema template used to format labels in L3
|
|
318
|
+
|
|
319
|
+
Returns:
|
|
320
|
+
Dict of entity_id -> probability
|
|
321
|
+
"""
|
|
322
|
+
scores = {}
|
|
323
|
+
for label, prob in class_probs.items():
|
|
324
|
+
matched = self._match_candidate_by_label(label, candidates, template)
|
|
325
|
+
if matched:
|
|
326
|
+
scores[matched.entity_id] = prob
|
|
327
|
+
return scores
|
|
328
|
+
|
|
329
|
+
    def _match_candidate_by_label(
        self,
        l3_label: str,
        candidates: List[DatabaseRecord],
        template: str = "{label}"
    ) -> Optional[DatabaseRecord]:
        """
        Match an L3 label to an L2 candidate using the same template L3 used.

        Uses the schema template to format candidate labels the same way L3
        did, enabling exact (case-insensitive) matching; falls back to a
        substring containment check when no template match is found.

        Example:
            template = "{label} {description}"
            L3 label = "TP53 Tumor suppressor gene..."
            candidate formatted = "TP53 Tumor suppressor gene..." -> MATCH!

        Args:
            l3_label: Label from L3 entity (formatted with template)
            candidates: List of candidates from L2
            template: Template string (e.g., "{label} {description}")

        Returns:
            Matched DatabaseRecord or None
        """
        l3_label_lower = l3_label.lower().strip()

        # Pass 1: format each candidate with the template and compare exactly.
        for candidate in candidates:
            try:
                # Format candidate using same template as L3.
                # NOTE(review): `.dict()` looks like the pydantic v1 model
                # API — confirm before migrating to pydantic v2.
                if hasattr(candidate, 'dict'):
                    cand_dict = candidate.dict()
                else:
                    # Manual fallback mirroring the DatabaseRecord fields.
                    cand_dict = {
                        'label': candidate.label,
                        'description': getattr(candidate, 'description', ''),
                        'entity_id': getattr(candidate, 'entity_id', ''),
                        'entity_type': getattr(candidate, 'entity_type', ''),
                        'popularity': getattr(candidate, 'popularity', 0),
                        'aliases': getattr(candidate, 'aliases', [])
                    }

                formatted_label = template.format(**cand_dict)

                if formatted_label.lower().strip() == l3_label_lower:
                    return candidate

            except (KeyError, AttributeError):
                # Template referenced a missing field — fall back to a
                # plain label comparison for this candidate only.
                if candidate.label.lower().strip() == l3_label_lower:
                    return candidate

        # Pass 2 (fallback): first candidate whose label is contained in the
        # L3 label, for robustness against template mismatches.
        for candidate in candidates:
            cand_label_lower = candidate.label.lower().strip()
            if cand_label_lower and cand_label_lower in l3_label_lower:
                return candidate

        return None
|
|
389
|
+
|
|
390
|
+
def _determine_stage(
|
|
391
|
+
self,
|
|
392
|
+
candidates: List[DatabaseRecord],
|
|
393
|
+
linked_entity: Optional[LinkedEntity]
|
|
394
|
+
) -> str:
|
|
395
|
+
"""Determine which pipeline stage was last successful"""
|
|
396
|
+
if linked_entity:
|
|
397
|
+
return "l3_linked"
|
|
398
|
+
elif candidates:
|
|
399
|
+
return "l2_found"
|
|
400
|
+
else:
|
|
401
|
+
return "l1_only"
|
|
402
|
+
|
|
403
|
+
def filter_by_confidence(
|
|
404
|
+
self,
|
|
405
|
+
entities: List[List[L0Entity]],
|
|
406
|
+
min_confidence: float = None
|
|
407
|
+
) -> List[List[L0Entity]]:
|
|
408
|
+
"""Filter entities by linking confidence"""
|
|
409
|
+
threshold = min_confidence if min_confidence is not None else self.config.min_confidence
|
|
410
|
+
|
|
411
|
+
filtered = []
|
|
412
|
+
for text_entities in entities:
|
|
413
|
+
filtered_text = [
|
|
414
|
+
e for e in text_entities
|
|
415
|
+
if e.linked_entity and e.linked_entity.confidence >= threshold
|
|
416
|
+
]
|
|
417
|
+
filtered.append(filtered_text)
|
|
418
|
+
|
|
419
|
+
return filtered
|
|
420
|
+
|
|
421
|
+
def sort_by_confidence(self, entities: List[List[L0Entity]]) -> List[List[L0Entity]]:
|
|
422
|
+
"""Sort entities by linking confidence (descending)"""
|
|
423
|
+
sorted_results = []
|
|
424
|
+
for text_entities in entities:
|
|
425
|
+
sorted_text = sorted(
|
|
426
|
+
text_entities,
|
|
427
|
+
key=lambda e: e.linked_entity.confidence if e.linked_entity else 0.0,
|
|
428
|
+
reverse=True
|
|
429
|
+
)
|
|
430
|
+
sorted_results.append(sorted_text)
|
|
431
|
+
return sorted_results
|
|
432
|
+
|
|
433
|
+
def calculate_stats(self, entities: List[List[L0Entity]]) -> dict:
|
|
434
|
+
"""Calculate pipeline statistics"""
|
|
435
|
+
total = 0
|
|
436
|
+
linked = 0
|
|
437
|
+
unlinked = 0
|
|
438
|
+
l1_only = 0
|
|
439
|
+
l2_found = 0
|
|
440
|
+
l3_linked = 0
|
|
441
|
+
l3_only = 0 # L3 entities found without L1 mentions (loose mode)
|
|
442
|
+
|
|
443
|
+
for text_entities in entities:
|
|
444
|
+
for entity in text_entities:
|
|
445
|
+
total += 1
|
|
446
|
+
|
|
447
|
+
if entity.is_linked:
|
|
448
|
+
linked += 1
|
|
449
|
+
else:
|
|
450
|
+
unlinked += 1
|
|
451
|
+
|
|
452
|
+
if entity.pipeline_stage == "l1_only":
|
|
453
|
+
l1_only += 1
|
|
454
|
+
elif entity.pipeline_stage == "l2_found":
|
|
455
|
+
l2_found += 1
|
|
456
|
+
elif entity.pipeline_stage == "l3_linked":
|
|
457
|
+
l3_linked += 1
|
|
458
|
+
elif entity.pipeline_stage == "l3_only":
|
|
459
|
+
l3_only += 1
|
|
460
|
+
|
|
461
|
+
return {
|
|
462
|
+
"total_mentions": total,
|
|
463
|
+
"linked": linked,
|
|
464
|
+
"unlinked": unlinked,
|
|
465
|
+
"linking_rate": linked / total if total > 0 else 0.0,
|
|
466
|
+
"stages": {
|
|
467
|
+
"l1_only": l1_only,
|
|
468
|
+
"l2_found": l2_found,
|
|
469
|
+
"l3_linked": l3_linked,
|
|
470
|
+
"l3_only": l3_only
|
|
471
|
+
}
|
|
472
|
+
}
|
glinker/l0/models.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from pydantic import Field
|
|
2
|
+
from typing import Dict, List, Optional
|
|
3
|
+
from glinker.core.base import BaseConfig, BaseInput, BaseOutput
|
|
4
|
+
from glinker.l1.models import L1Entity
|
|
5
|
+
from glinker.l2.models import DatabaseRecord
|
|
6
|
+
from glinker.l3.models import L3Entity
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class L0Config(BaseConfig):
    """L0 aggregation configuration"""
    # Threshold used by L0Component.filter_by_confidence when no override is given.
    min_confidence: float = Field(0.0, description="Minimum confidence threshold for linked entities")
    # NOTE(review): include_unlinked and return_all_candidates are not read
    # by L0Component in this module — confirm they are consumed elsewhere.
    include_unlinked: bool = Field(True, description="Include mentions without linked entities")
    return_all_candidates: bool = Field(False, description="Return all candidates or only top match")
    # When False, L3 entities with no matching L1 mention are emitted with
    # pipeline_stage="l3_only" (see L0Component._aggregate_single_text).
    strict_matching: bool = Field(
        True,
        description="If True, only include entities that match L1 mentions. "
                    "If False, also include L3 entities found outside L1 mentions."
    )
    # Passed as `tolerance` to the fuzzy L1<->L3 position match.
    position_tolerance: int = Field(
        2,
        description="Maximum character difference for fuzzy position matching between L1 and L3 entities"
    )
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class L0Input(BaseInput):
    """L0 processor input - outputs from L1, L2, L3 (alternative to passing
    the three lists directly to L0Processor.__call__)."""
    l1_entities: List[List[L1Entity]] = Field(..., description="Entities from L1 (per text)")
    l2_candidates: List[List[DatabaseRecord]] = Field(..., description="Candidates from L2 (per mention)")
    l3_entities: List[List[L3Entity]] = Field(..., description="Linked entities from L3 (per text)")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class LinkedEntity(BaseOutput):
    """Linked entity information from L3"""
    # Falls back to "unknown" when L0Component cannot match the L3 label
    # to any L2 candidate.
    entity_id: str = Field(..., description="Entity ID from matched candidate")
    label: str = Field(..., description="Entity label")
    confidence: float = Field(..., description="Linking confidence score from L3")
    start: int = Field(..., description="Start position in text")
    end: int = Field(..., description="End position in text")
    matched_text: str = Field(..., description="Matched text from L3")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class L0Entity(BaseOutput):
    """
    Aggregated entity combining information from all layers:
    - L1: mention detection (text, position, context)
    - L2: candidates (entity database records)
    - L3: disambiguation (linked entity with confidence)
    """
    # From L1 - mention detection
    mention_text: str = Field(..., description="Extracted mention text from L1")
    mention_start: int = Field(..., description="Start position in original text")
    mention_end: int = Field(..., description="End position in original text")
    left_context: str = Field(..., description="Left context from L1")
    right_context: str = Field(..., description="Right context from L1")

    # From L2 - candidate retrieval
    candidates: List[DatabaseRecord] = Field(
        default_factory=list,
        description="All candidates found in L2 for this mention"
    )
    num_candidates: int = Field(0, description="Number of candidates found")

    # From L3 - entity linking
    linked_entity: Optional[LinkedEntity] = Field(
        None,
        description="Linked entity if disambiguation was successful"
    )
    is_linked: bool = Field(False, description="Whether entity was successfully linked")
    candidate_scores: Dict[str, float] = Field(
        default_factory=dict,
        description="L3 class probability per candidate entity_id"
    )

    # Aggregated metadata.
    # Fixed description: the component also emits 'l3_only' for L3 hits
    # without an L1 mention (loose-matching mode), which the original
    # description omitted.
    pipeline_stage: str = Field(
        "",
        description="Last successful stage: 'l1_only', 'l2_found', 'l3_linked', 'l3_only'"
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class L0Output(BaseOutput):
    """L0 processor output"""
    # One list of aggregated entities per input text.
    entities: List[List[L0Entity]] = Field(
        ...,
        description="Aggregated entities per text with full pipeline information"
    )
    # Shape produced by L0Component.calculate_stats: total_mentions, linked,
    # unlinked, linking_rate, and a per-stage breakdown under "stages".
    stats: dict = Field(
        default_factory=dict,
        description="Pipeline statistics (total, linked, unlinked, etc.)"
    )
|
glinker/l0/processor.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from typing import Any, List, Optional

from glinker.core.base import BaseProcessor
from glinker.core.registry import processor_registry

from glinker.l1.models import L1Entity
from glinker.l2.models import DatabaseRecord
from glinker.l3.models import L3Entity

from .component import L0Component
from .models import L0Config, L0Input, L0Output, L0Entity
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class L0Processor(BaseProcessor[L0Config, L0Input, L0Output]):
    """
    L0 aggregation processor - combines outputs from all pipeline layers.

    This processor aggregates information from:
    - L1: Entity mentions (text, position, context)
    - L2: Candidate entities (database records)
    - L3: Linked entities (disambiguation results)

    Into a unified L0Entity structure showing the full pipeline flow.
    """

    def __init__(
        self,
        config: L0Config,
        component: L0Component,
        pipeline: Optional[list[tuple[str, dict[str, Any]]]] = None
    ):
        """
        Args:
            config: L0 aggregation configuration.
            component: Component implementing the pipeline methods.
            pipeline: Optional explicit method chain; defaults to
                ``_default_pipeline()``. (Annotation fixed to Optional —
                the default is None.)
        """
        super().__init__(config, component, pipeline)
        self._validate_pipeline()
        self.schema = {}  # Will be set by DAG executor if node has schema
|
|
32
|
+
|
|
33
|
+
def _default_pipeline(self) -> list[tuple[str, dict[str, Any]]]:
|
|
34
|
+
return [
|
|
35
|
+
("aggregate", {}),
|
|
36
|
+
("filter_by_confidence", {}),
|
|
37
|
+
("sort_by_confidence", {}),
|
|
38
|
+
("calculate_stats", {})
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
    def __call__(
        self,
        l1_entities: List[List[L1Entity]] = None,
        l2_candidates: List[List[DatabaseRecord]] = None,
        l3_entities: List[List[L3Entity]] = None,
        input_data: L0Input = None
    ) -> L0Output:
        """
        Process and aggregate outputs from L1, L2, L3.

        Args:
            l1_entities: Entities from L1 (mention extraction); optional —
                replaced with empty per-text lists when absent.
            l2_candidates: Candidates from L2 (database search)
            l3_entities: Entities from L3 (entity linking)
            input_data: Alternative: L0Input carrying all three lists
                (takes precedence over the direct parameters).

        Returns:
            L0Output with aggregated entities and statistics

        Raises:
            ValueError: if neither input_data nor both l2_candidates and
                l3_entities are provided.
        """

        # Support both direct params and L0Input
        if input_data is not None:
            l1_entities = input_data.l1_entities
            l2_candidates = input_data.l2_candidates
            l3_entities = input_data.l3_entities

        if l2_candidates is None or l3_entities is None:
            raise ValueError(
                "Either provide 'l2_candidates' and 'l3_entities' "
                "(and optionally 'l1_entities'), or 'input_data'"
            )

        # When L1 is absent (simple pipeline), create empty mention lists
        if l1_entities is None:
            l1_entities = [[] for _ in l3_entities]

        # Pass schema template to component for matching
        template = self.schema.get('template', '{label}') if self.schema else '{label}'

        # Execute aggregation pipeline
        aggregated_entities = self.component.aggregate(
            l1_entities, l2_candidates, l3_entities, template=template
        )

        # Apply pipeline transformations (filter, sort, etc.)
        results = aggregated_entities
        stats = {}

        # NOTE(review): slicing [1:] assumes the first pipeline entry is
        # always 'aggregate' — confirm _validate_pipeline enforces this.
        for method_name, kwargs in self.pipeline[1:]:  # Skip 'aggregate' as we already did it
            if method_name == "calculate_stats":
                # Stats are captured separately; they do not replace `results`.
                stats = self.component.calculate_stats(results)
            else:
                method = getattr(self.component, method_name)
                results = method(results, **kwargs)

        # Calculate final stats if not already done
        if not stats:
            stats = self.component.calculate_stats(results)

        return L0Output(entities=results, stats=stats)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@processor_registry.register("l0_aggregator")
def create_l0_processor(config_dict: dict, pipeline: Optional[list] = None) -> L0Processor:
    """
    Factory registered as ``l0_aggregator``: builds the config, component
    and processor in one step.

    Args:
        config_dict: Keyword arguments for L0Config.
        pipeline: Optional explicit method chain forwarded to L0Processor.
            (Annotation fixed to Optional — the default is None.)

    Returns:
        A ready-to-use L0Processor.
    """
    config = L0Config(**config_dict)
    component = L0Component(config)
    return L0Processor(config, component, pipeline)
|
glinker/l1/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from .models import L1Config, L1GlinerConfig, L1Input, L1Output, L1Entity
|
|
2
|
+
from .component import L1SpacyComponent, L1GlinerComponent
|
|
3
|
+
from .processor import L1SpacyProcessor, L1GlinerProcessor
|
|
4
|
+
|
|
5
|
+
# Public API of the l1 subpackage (mention-extraction layer): models plus
# the spaCy- and GLiNER-backed component/processor pairs.
__all__ = [
    "L1Config",
    "L1GlinerConfig",
    "L1Input",
    "L1Output",
    "L1Entity",
    "L1SpacyComponent",
    "L1SpacyProcessor",
    "L1GlinerComponent",
    "L1GlinerProcessor",
]
|