glinker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,472 @@
1
+ from typing import List, Optional, Dict, Tuple
2
+ from glinker.core.base import BaseComponent
3
+ from .models import (
4
+ L0Config, L0Entity, LinkedEntity
5
+ )
6
+ from glinker.l1.models import L1Entity
7
+ from glinker.l2.models import DatabaseRecord
8
+ from glinker.l3.models import L3Entity
9
+
10
+
11
class L0Component(BaseComponent[L0Config]):
    """
    L0 aggregation component - combines outputs from L1, L2, L3.

    Workflow:
    1. For each L1 mention → find its L2 candidates
    2. For each L1 mention → check if it was linked in L3
    3. Create L0Entity with full information from all layers

    When ``config.strict_matching`` is False, L3 entities that do not
    correspond to any L1 mention are also emitted (stage ``"l3_only"``).
    """
20
+
21
+ def get_available_methods(self) -> List[str]:
22
+ return [
23
+ "aggregate",
24
+ "filter_by_confidence",
25
+ "sort_by_confidence",
26
+ "calculate_stats"
27
+ ]
28
+
29
+ def aggregate(
30
+ self,
31
+ l1_entities: List[List[L1Entity]],
32
+ l2_candidates: List[List[DatabaseRecord]],
33
+ l3_entities: List[List[L3Entity]],
34
+ template: str = "{label}"
35
+ ) -> List[List[L0Entity]]:
36
+ """
37
+ Main aggregation method - combines all layers
38
+
39
+ Args:
40
+ l1_entities: [[L1Entity, ...], ...] - one list per text
41
+ l2_candidates: [[DatabaseRecord, ...], ...] - one list per text
42
+ l3_entities: [[L3Entity, ...], ...] - one list per text
43
+ template: Label template from L3 schema (e.g., "{label} {description}")
44
+
45
+ Returns:
46
+ [[L0Entity, ...], ...] - aggregated entities per text
47
+ """
48
+ all_results = []
49
+
50
+ # Process each text separately
51
+ for text_idx in range(len(l1_entities)):
52
+ l1_mentions = l1_entities[text_idx] if text_idx < len(l1_entities) else []
53
+ l2_cands = l2_candidates[text_idx] if text_idx < len(l2_candidates) else []
54
+ l3_links = l3_entities[text_idx] if text_idx < len(l3_entities) else []
55
+
56
+ text_results = self._aggregate_single_text(l1_mentions, l2_cands, l3_links, template)
57
+ all_results.append(text_results)
58
+
59
+ return all_results
60
+
61
    def _aggregate_single_text(
        self,
        l1_mentions: List[L1Entity],
        l2_candidates: List[DatabaseRecord],
        l3_links: List[L3Entity],
        template: str = "{label}"
    ) -> List[L0Entity]:
        """
        Aggregate data for a single text.

        Strategy:
        1. Build index of L3 linked entities by position
        2. For each L1 mention:
           - Find corresponding candidates from L2
           - Check if it was linked in L3
           - Create L0Entity
        3. If strict_matching=False, also include L3 entities outside L1 mentions
           (these get pipeline_stage="l3_only" and empty contexts)
        """
        # Build L3 index by (start, end) position for O(1) exact lookups.
        l3_by_position = self._build_l3_index(l3_links)

        results = []
        used_l3_positions = set()  # Track which L3 entities were matched to L1 mentions

        for mention_idx, l1_mention in enumerate(l1_mentions):
            # Get candidates for this mention (L2 returns flat list, need to group)
            # Assuming L2 candidates are in the same order as L1 mentions
            mention_candidates = self._get_candidates_for_mention(
                mention_idx, l1_mention, l2_candidates
            )

            # Check if this mention was linked in L3 (exact position first,
            # then fuzzy within config.position_tolerance characters).
            linked_entity, l3_pos = self._find_linked_entity_with_position(
                l1_mention, l3_by_position, mention_candidates, template,
                tolerance=self.config.position_tolerance
            )

            # Mark the L3 span as consumed so loose mode doesn't emit it twice.
            if l3_pos:
                used_l3_positions.add(l3_pos)

            # Build candidate_scores from L3 class_probs.
            # NOTE(review): scores are resolved against the full l2_candidates
            # list, not mention_candidates — confirm this is intended.
            candidate_scores = {}
            l3_entity = l3_by_position.get(l3_pos) if l3_pos else None
            if l3_entity and l3_entity.class_probs:
                candidate_scores = self._build_candidate_scores(
                    l3_entity.class_probs, l2_candidates, template
                )

            # Determine pipeline stage (l1_only / l2_found / l3_linked)
            pipeline_stage = self._determine_stage(mention_candidates, linked_entity)

            # Create L0Entity combining all three layers
            l0_entity = L0Entity(
                mention_text=l1_mention.text,
                mention_start=l1_mention.start,
                mention_end=l1_mention.end,
                left_context=l1_mention.left_context,
                right_context=l1_mention.right_context,
                candidates=mention_candidates,
                num_candidates=len(mention_candidates),
                linked_entity=linked_entity,
                is_linked=linked_entity is not None,
                candidate_scores=candidate_scores,
                pipeline_stage=pipeline_stage
            )

            results.append(l0_entity)

        # If loose mode, include L3 entities that weren't matched to L1 mentions
        if not self.config.strict_matching:
            for (l3_start, l3_end), l3_entity in l3_by_position.items():
                if (l3_start, l3_end) not in used_l3_positions:
                    # This L3 entity was not matched to any L1 mention.
                    # Find candidate by label (no position info to go on).
                    matched_candidate = self._match_candidate_by_label(
                        l3_entity.label, l2_candidates, template
                    )

                    # Build candidate_scores from class_probs
                    candidate_scores = {}
                    if l3_entity.class_probs:
                        candidate_scores = self._build_candidate_scores(
                            l3_entity.class_probs, l2_candidates, template
                        )

                    # "unknown" is the sentinel entity_id used when no L2
                    # candidate matches the L3 label.
                    linked = LinkedEntity(
                        entity_id=matched_candidate.entity_id if matched_candidate else "unknown",
                        label=matched_candidate.label if matched_candidate else l3_entity.label,
                        confidence=l3_entity.score,
                        start=l3_entity.start,
                        end=l3_entity.end,
                        matched_text=l3_entity.text
                    )

                    l0_entity = L0Entity(
                        mention_text=l3_entity.text,
                        mention_start=l3_entity.start,
                        mention_end=l3_entity.end,
                        left_context="",  # No context from L1
                        right_context="",
                        candidates=[matched_candidate] if matched_candidate else [],
                        num_candidates=1 if matched_candidate else 0,
                        linked_entity=linked,
                        is_linked=True,
                        candidate_scores=candidate_scores,
                        pipeline_stage="l3_only"  # Indicates L3 found it without L1
                    )
                    results.append(l0_entity)

        return results
171
+
172
+ def _build_l3_index(self, l3_links: List[L3Entity]) -> Dict[Tuple[int, int], L3Entity]:
173
+ """Build index of L3 entities by (start, end) position"""
174
+ index = {}
175
+ for entity in l3_links:
176
+ key = (entity.start, entity.end)
177
+ index[key] = entity
178
+ return index
179
+
180
+ def _get_candidates_for_mention(
181
+ self,
182
+ mention_idx: int,
183
+ l1_mention: L1Entity,
184
+ all_candidates: List[DatabaseRecord]
185
+ ) -> List[DatabaseRecord]:
186
+ """
187
+ Get candidates for specific mention
188
+
189
+ Note: L2 returns candidates grouped per text. We need to match by text content.
190
+ """
191
+ matched_candidates = []
192
+
193
+ # Match candidates by mention text (normalize)
194
+ mention_text_lower = l1_mention.text.lower().strip()
195
+
196
+ for candidate in all_candidates:
197
+ # Check if candidate matches this mention
198
+ if candidate.label.lower().strip() == mention_text_lower:
199
+ matched_candidates.append(candidate)
200
+ continue
201
+
202
+ # Check aliases
203
+ for alias in candidate.aliases:
204
+ if alias.lower().strip() == mention_text_lower:
205
+ matched_candidates.append(candidate)
206
+ break
207
+
208
+ return matched_candidates
209
+
210
+ def _find_linked_entity(
211
+ self,
212
+ l1_mention: L1Entity,
213
+ l3_by_position: Dict[Tuple[int, int], L3Entity],
214
+ candidates: List[DatabaseRecord],
215
+ template: str = "{label}"
216
+ ) -> Optional[LinkedEntity]:
217
+ """
218
+ Find if this L1 mention was linked in L3
219
+
220
+ Strategy:
221
+ 1. Look up L3 entity by position (start, end)
222
+ 2. If found, match with candidates to get entity_id
223
+ 3. Return LinkedEntity with full information
224
+ """
225
+ linked, _ = self._find_linked_entity_with_position(
226
+ l1_mention, l3_by_position, candidates, template
227
+ )
228
+ return linked
229
+
230
    def _find_linked_entity_with_position(
        self,
        l1_mention: L1Entity,
        l3_by_position: Dict[Tuple[int, int], L3Entity],
        candidates: List[DatabaseRecord],
        template: str = "{label}",
        tolerance: int = 2
    ) -> Tuple[Optional[LinkedEntity], Optional[Tuple[int, int]]]:
        """
        Find if this L1 mention was linked in L3, and return the matched position.

        Lookup order: exact (start, end) match first, then fuzzy match within
        ``tolerance`` characters on both span edges.

        Returns:
            Tuple of (LinkedEntity or None, matched position tuple or None)
        """
        # Try exact position match
        key = (l1_mention.start, l1_mention.end)
        l3_entity = l3_by_position.get(key)
        matched_key = key if l3_entity else None

        if not l3_entity:
            # Try fuzzy position match (text might be slightly different)
            l3_entity, matched_key = self._fuzzy_position_match_with_key(
                l1_mention.start, l1_mention.end, l3_by_position, tolerance
            )

        if not l3_entity:
            return None, None

        # Find matching candidate by label using template (same formatting L3 saw)
        matched_candidate = self._match_candidate_by_label(l3_entity.label, candidates, template)

        if not matched_candidate:
            # L3 found entity but no matching candidate - shouldn't happen but handle gracefully.
            # "unknown" is the sentinel entity_id for this case; span/score come from L3.
            return LinkedEntity(
                entity_id="unknown",
                label=l3_entity.label,
                confidence=l3_entity.score,
                start=l3_entity.start,
                end=l3_entity.end,
                matched_text=l3_entity.text
            ), matched_key

        # Candidate matched: take identity from the L2 record, span/score from L3.
        return LinkedEntity(
            entity_id=matched_candidate.entity_id,
            label=matched_candidate.label,
            confidence=l3_entity.score,
            start=l3_entity.start,
            end=l3_entity.end,
            matched_text=l3_entity.text
        ), matched_key
280
+
281
+ def _fuzzy_position_match(
282
+ self,
283
+ start: int,
284
+ end: int,
285
+ l3_by_position: Dict[Tuple[int, int], L3Entity],
286
+ tolerance: int = 2
287
+ ) -> Optional[L3Entity]:
288
+ """Find L3 entity with position close to given range"""
289
+ entity, _ = self._fuzzy_position_match_with_key(start, end, l3_by_position, tolerance)
290
+ return entity
291
+
292
+ def _fuzzy_position_match_with_key(
293
+ self,
294
+ start: int,
295
+ end: int,
296
+ l3_by_position: Dict[Tuple[int, int], L3Entity],
297
+ tolerance: int = 2
298
+ ) -> Tuple[Optional[L3Entity], Optional[Tuple[int, int]]]:
299
+ """Find L3 entity with position close to given range, return with its key"""
300
+ for (l3_start, l3_end), entity in l3_by_position.items():
301
+ if abs(l3_start - start) <= tolerance and abs(l3_end - end) <= tolerance:
302
+ return entity, (l3_start, l3_end)
303
+ return None, None
304
+
305
+ def _build_candidate_scores(
306
+ self,
307
+ class_probs: Dict[str, float],
308
+ candidates: List[DatabaseRecord],
309
+ template: str = "{label}"
310
+ ) -> Dict[str, float]:
311
+ """
312
+ Map L3 class_probs (label -> probability) to candidate entity_ids.
313
+
314
+ Args:
315
+ class_probs: Dict of label -> probability from L3 entity
316
+ candidates: L2 candidate records
317
+ template: Schema template used to format labels in L3
318
+
319
+ Returns:
320
+ Dict of entity_id -> probability
321
+ """
322
+ scores = {}
323
+ for label, prob in class_probs.items():
324
+ matched = self._match_candidate_by_label(label, candidates, template)
325
+ if matched:
326
+ scores[matched.entity_id] = prob
327
+ return scores
328
+
329
    def _match_candidate_by_label(
        self,
        l3_label: str,
        candidates: List[DatabaseRecord],
        template: str = "{label}"
    ) -> Optional[DatabaseRecord]:
        """
        Match L3 label with L2 candidate using the same template.

        Uses the schema template to format candidate labels the same way L3 did,
        enabling exact matching.

        Example:
            template = "{label} {description}"
            L3 label = "TP53 Tumor suppressor gene..."
            candidate formatted = "TP53 Tumor suppressor gene..." -> MATCH!

        Matching order (first hit wins):
        1. template-formatted candidate label == l3_label (case-insensitive)
        2. on formatting failure only: plain candidate.label == l3_label
        3. fallback over all candidates: candidate.label is a substring of l3_label

        Args:
            l3_label: Label from L3 entity (formatted with template)
            candidates: List of candidates from L2
            template: Template string (e.g., "{label} {description}")

        Returns:
            Matched DatabaseRecord or None
        """
        l3_label_lower = l3_label.lower().strip()

        # Try to match by formatting each candidate with the template
        for candidate in candidates:
            try:
                # Format candidate using same template as L3.
                # NOTE(review): ``.dict()`` is the pydantic-v1 serializer;
                # confirm compatibility if the project moves to pydantic v2
                # (``model_dump``).
                if hasattr(candidate, 'dict'):
                    cand_dict = candidate.dict()
                else:
                    # Duck-typed fallback for non-pydantic records: build the
                    # template namespace field by field, with safe defaults.
                    cand_dict = {
                        'label': candidate.label,
                        'description': getattr(candidate, 'description', ''),
                        'entity_id': getattr(candidate, 'entity_id', ''),
                        'entity_type': getattr(candidate, 'entity_type', ''),
                        'popularity': getattr(candidate, 'popularity', 0),
                        'aliases': getattr(candidate, 'aliases', [])
                    }

                formatted_label = template.format(**cand_dict)

                if formatted_label.lower().strip() == l3_label_lower:
                    return candidate

            except (KeyError, AttributeError):
                # Template formatting failed (placeholder missing from the
                # record), try simple label match for this candidate only.
                if candidate.label.lower().strip() == l3_label_lower:
                    return candidate

        # Fallback: try simple contains match (for robustness).
        # NOTE(review): a short candidate label contained in an unrelated
        # formatted L3 label can mis-match here — first such candidate wins.
        for candidate in candidates:
            cand_label_lower = candidate.label.lower().strip()
            if cand_label_lower and cand_label_lower in l3_label_lower:
                return candidate

        return None
389
+
390
+ def _determine_stage(
391
+ self,
392
+ candidates: List[DatabaseRecord],
393
+ linked_entity: Optional[LinkedEntity]
394
+ ) -> str:
395
+ """Determine which pipeline stage was last successful"""
396
+ if linked_entity:
397
+ return "l3_linked"
398
+ elif candidates:
399
+ return "l2_found"
400
+ else:
401
+ return "l1_only"
402
+
403
+ def filter_by_confidence(
404
+ self,
405
+ entities: List[List[L0Entity]],
406
+ min_confidence: float = None
407
+ ) -> List[List[L0Entity]]:
408
+ """Filter entities by linking confidence"""
409
+ threshold = min_confidence if min_confidence is not None else self.config.min_confidence
410
+
411
+ filtered = []
412
+ for text_entities in entities:
413
+ filtered_text = [
414
+ e for e in text_entities
415
+ if e.linked_entity and e.linked_entity.confidence >= threshold
416
+ ]
417
+ filtered.append(filtered_text)
418
+
419
+ return filtered
420
+
421
+ def sort_by_confidence(self, entities: List[List[L0Entity]]) -> List[List[L0Entity]]:
422
+ """Sort entities by linking confidence (descending)"""
423
+ sorted_results = []
424
+ for text_entities in entities:
425
+ sorted_text = sorted(
426
+ text_entities,
427
+ key=lambda e: e.linked_entity.confidence if e.linked_entity else 0.0,
428
+ reverse=True
429
+ )
430
+ sorted_results.append(sorted_text)
431
+ return sorted_results
432
+
433
+ def calculate_stats(self, entities: List[List[L0Entity]]) -> dict:
434
+ """Calculate pipeline statistics"""
435
+ total = 0
436
+ linked = 0
437
+ unlinked = 0
438
+ l1_only = 0
439
+ l2_found = 0
440
+ l3_linked = 0
441
+ l3_only = 0 # L3 entities found without L1 mentions (loose mode)
442
+
443
+ for text_entities in entities:
444
+ for entity in text_entities:
445
+ total += 1
446
+
447
+ if entity.is_linked:
448
+ linked += 1
449
+ else:
450
+ unlinked += 1
451
+
452
+ if entity.pipeline_stage == "l1_only":
453
+ l1_only += 1
454
+ elif entity.pipeline_stage == "l2_found":
455
+ l2_found += 1
456
+ elif entity.pipeline_stage == "l3_linked":
457
+ l3_linked += 1
458
+ elif entity.pipeline_stage == "l3_only":
459
+ l3_only += 1
460
+
461
+ return {
462
+ "total_mentions": total,
463
+ "linked": linked,
464
+ "unlinked": unlinked,
465
+ "linking_rate": linked / total if total > 0 else 0.0,
466
+ "stages": {
467
+ "l1_only": l1_only,
468
+ "l2_found": l2_found,
469
+ "l3_linked": l3_linked,
470
+ "l3_only": l3_only
471
+ }
472
+ }
glinker/l0/models.py ADDED
@@ -0,0 +1,90 @@
1
+ from pydantic import Field
2
+ from typing import Dict, List, Optional
3
+ from glinker.core.base import BaseConfig, BaseInput, BaseOutput
4
+ from glinker.l1.models import L1Entity
5
+ from glinker.l2.models import DatabaseRecord
6
+ from glinker.l3.models import L3Entity
7
+
8
+
9
class L0Config(BaseConfig):
    """L0 aggregation configuration"""
    # Threshold used by L0Component.filter_by_confidence when no override is given.
    min_confidence: float = Field(0.0, description="Minimum confidence threshold for linked entities")
    # NOTE(review): not referenced by the visible L0Component code — confirm intended use.
    include_unlinked: bool = Field(True, description="Include mentions without linked entities")
    # NOTE(review): not referenced by the visible L0Component code — confirm intended use.
    return_all_candidates: bool = Field(False, description="Return all candidates or only top match")
    # Loose mode (False) makes the component emit "l3_only" entities for L3
    # hits that match no L1 mention.
    strict_matching: bool = Field(
        True,
        description="If True, only include entities that match L1 mentions. "
                    "If False, also include L3 entities found outside L1 mentions."
    )
    # Passed as `tolerance` to _find_linked_entity_with_position during aggregation.
    position_tolerance: int = Field(
        2,
        description="Maximum character difference for fuzzy position matching between L1 and L3 entities"
    )
23
+
24
+
25
class L0Input(BaseInput):
    """L0 processor input - outputs from L1, L2, L3, all parallel per text."""
    # Mention spans with context, one inner list per input text.
    l1_entities: List[List[L1Entity]] = Field(..., description="Entities from L1 (per text)")
    # Candidate records, one inner list per input text.
    l2_candidates: List[List[DatabaseRecord]] = Field(..., description="Candidates from L2 (per mention)")
    # Disambiguated entities, one inner list per input text.
    l3_entities: List[List[L3Entity]] = Field(..., description="Linked entities from L3 (per text)")
30
+
31
+
32
class LinkedEntity(BaseOutput):
    """Linked entity information from L3, resolved against an L2 candidate."""
    # "unknown" when L3 linked a span but no L2 candidate matched its label.
    entity_id: str = Field(..., description="Entity ID from matched candidate")
    label: str = Field(..., description="Entity label")
    confidence: float = Field(..., description="Linking confidence score from L3")
    start: int = Field(..., description="Start position in text")
    end: int = Field(..., description="End position in text")
    matched_text: str = Field(..., description="Matched text from L3")
40
+
41
+
42
class L0Entity(BaseOutput):
    """
    Aggregated entity combining information from all layers:
    - L1: mention detection (text, position, context)
    - L2: candidates (entity database records)
    - L3: disambiguation (linked entity with confidence)
    """
    # From L1 - mention detection
    mention_text: str = Field(..., description="Extracted mention text from L1")
    mention_start: int = Field(..., description="Start position in original text")
    mention_end: int = Field(..., description="End position in original text")
    left_context: str = Field(..., description="Left context from L1")
    right_context: str = Field(..., description="Right context from L1")

    # From L2 - candidate retrieval
    candidates: List[DatabaseRecord] = Field(
        default_factory=list,
        description="All candidates found in L2 for this mention"
    )
    num_candidates: int = Field(0, description="Number of candidates found")

    # From L3 - entity linking
    linked_entity: Optional[LinkedEntity] = Field(
        None,
        description="Linked entity if disambiguation was successful"
    )
    is_linked: bool = Field(False, description="Whether entity was successfully linked")
    candidate_scores: Dict[str, float] = Field(
        default_factory=dict,
        description="L3 class probability per candidate entity_id"
    )

    # Aggregated metadata
    # Fix: description previously omitted 'l3_only', which the aggregation
    # component assigns to L3 entities found without an L1 mention (loose mode).
    pipeline_stage: str = Field(
        "",
        description="Last successful stage: 'l1_only', 'l2_found', 'l3_linked', or 'l3_only'"
    )
79
+
80
+
81
class L0Output(BaseOutput):
    """L0 processor output"""
    # One inner list per input text, after any filter/sort pipeline steps.
    entities: List[List[L0Entity]] = Field(
        ...,
        description="Aggregated entities per text with full pipeline information"
    )
    # Shape produced by L0Component.calculate_stats: total_mentions, linked,
    # unlinked, linking_rate, and a per-stage breakdown.
    stats: dict = Field(
        default_factory=dict,
        description="Pipeline statistics (total, linked, unlinked, etc.)"
    )
@@ -0,0 +1,108 @@
1
+ from typing import Any, List
2
+ from glinker.core.base import BaseProcessor
3
+ from glinker.core.registry import processor_registry
4
+ from .models import L0Config, L0Input, L0Output, L0Entity
5
+ from .component import L0Component
6
+ from glinker.l1.models import L1Entity
7
+ from glinker.l2.models import DatabaseRecord
8
+ from glinker.l3.models import L3Entity
9
+
10
+
11
class L0Processor(BaseProcessor[L0Config, L0Input, L0Output]):
    """
    L0 aggregation processor - combines outputs from all pipeline layers.

    This processor aggregates information from:
    - L1: Entity mentions (text, position, context)
    - L2: Candidate entities (database records)
    - L3: Linked entities (disambiguation results)

    Into a unified L0Entity structure showing the full pipeline flow.
    """
22
+
23
    def __init__(
        self,
        config: L0Config,
        component: L0Component,
        pipeline: list[tuple[str, dict[str, Any]]] = None
    ):
        """
        Wire the aggregation component to the processor.

        Args:
            config: L0 aggregation configuration.
            component: The L0Component doing the actual aggregation work.
            pipeline: Optional (method_name, kwargs) steps; defaults to
                the order returned by _default_pipeline() when None.
        """
        super().__init__(config, component, pipeline)
        # Fail fast on unknown method names in a caller-supplied pipeline.
        self._validate_pipeline()
        self.schema = {}  # Will be set by DAG executor if node has schema
32
+
33
+ def _default_pipeline(self) -> list[tuple[str, dict[str, Any]]]:
34
+ return [
35
+ ("aggregate", {}),
36
+ ("filter_by_confidence", {}),
37
+ ("sort_by_confidence", {}),
38
+ ("calculate_stats", {})
39
+ ]
40
+
41
    def __call__(
        self,
        l1_entities: List[List[L1Entity]] = None,
        l2_candidates: List[List[DatabaseRecord]] = None,
        l3_entities: List[List[L3Entity]] = None,
        input_data: L0Input = None
    ) -> L0Output:
        """
        Process and aggregate outputs from L1, L2, L3.

        Args:
            l1_entities: Entities from L1 (mention extraction); optional —
                empty mention lists are substituted when absent.
            l2_candidates: Candidates from L2 (database search)
            l3_entities: Entities from L3 (entity linking)
            input_data: Alternative: L0Input with all data; takes precedence
                over the individual keyword arguments.

        Returns:
            L0Output with aggregated entities and statistics

        Raises:
            ValueError: If neither input_data nor both l2_candidates and
                l3_entities are provided.
        """

        # Support both direct params and L0Input
        if input_data is not None:
            l1_entities = input_data.l1_entities
            l2_candidates = input_data.l2_candidates
            l3_entities = input_data.l3_entities

        if l2_candidates is None or l3_entities is None:
            raise ValueError(
                "Either provide 'l2_candidates' and 'l3_entities' "
                "(and optionally 'l1_entities'), or 'input_data'"
            )

        # When L1 is absent (simple pipeline), create empty mention lists
        if l1_entities is None:
            l1_entities = [[] for _ in l3_entities]

        # Pass schema template to component for matching
        template = self.schema.get('template', '{label}') if self.schema else '{label}'

        # Execute aggregation pipeline
        aggregated_entities = self.component.aggregate(
            l1_entities, l2_candidates, l3_entities, template=template
        )

        # Apply pipeline transformations (filter, sort, etc.)
        results = aggregated_entities
        stats = {}

        # NOTE(review): self.pipeline[1:] assumes the first configured step is
        # always 'aggregate' (already executed above) — confirm this holds for
        # caller-supplied pipelines.
        for method_name, kwargs in self.pipeline[1:]:  # Skip 'aggregate' as we already did it
            if method_name == "calculate_stats":
                # Stats are a side output, not a transformation of `results`.
                stats = self.component.calculate_stats(results)
            else:
                method = getattr(self.component, method_name)
                results = method(results, **kwargs)

        # Calculate final stats if not already done
        if not stats:
            stats = self.component.calculate_stats(results)

        return L0Output(entities=results, stats=stats)
101
+
102
+
103
@processor_registry.register("l0_aggregator")
def create_l0_processor(config_dict: dict, pipeline: list = None) -> L0Processor:
    """
    Factory registered as 'l0_aggregator': builds the component and processor.

    Args:
        config_dict: Raw config values, validated through L0Config.
        pipeline: Optional (method_name, kwargs) steps; None selects the
            processor's default pipeline.
    """
    config = L0Config(**config_dict)
    component = L0Component(config)
    return L0Processor(config, component, pipeline)
glinker/l1/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ from .models import L1Config, L1GlinerConfig, L1Input, L1Output, L1Entity
2
+ from .component import L1SpacyComponent, L1GlinerComponent
3
+ from .processor import L1SpacyProcessor, L1GlinerProcessor
4
+
5
+ __all__ = [
6
+ "L1Config",
7
+ "L1GlinerConfig",
8
+ "L1Input",
9
+ "L1Output",
10
+ "L1Entity",
11
+ "L1SpacyComponent",
12
+ "L1SpacyProcessor",
13
+ "L1GlinerComponent",
14
+ "L1GlinerProcessor",
15
+ ]