glinker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,261 @@
1
+ from pathlib import Path
2
+ from typing import Any, Dict, List, Optional, Union
3
+ import yaml
4
+ from .registry import processor_registry
5
+ from .dag import DAGPipeline, DAGExecutor, PipeNode, InputConfig, OutputConfig
6
+
7
+
8
def load_yaml(path: str | Path) -> dict:
    """Load a YAML configuration file.

    Args:
        path: Location of the YAML file to read.

    Returns:
        The parsed document. An empty file yields an empty dict so the
        declared return type holds and callers fail with a ``KeyError`` on
        the missing key instead of a ``TypeError`` on ``None``.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Explicit encoding: without it the platform locale decides how the file
    # is decoded, which breaks non-ASCII config values on some systems.
    with open(path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # safe_load returns None for an empty document.
    return {} if data is None else data
12
+
13
+
14
class ProcessorFactory:
    """Factory for creating pipelines from configs.

    All methods are static; the class only groups the constructors:

    - ``create_from_registry``: build one processor by registered name.
    - ``create_pipeline``: build a DAG executor from a YAML file.
    - ``create_from_dict``: build a DAG executor from an in-memory dict.
    - ``create_simple``: build a preset L2 -> L3 [-> L4] -> L0 executor.
    """

    @staticmethod
    def create_from_registry(
        processor_name: str,
        config_dict: dict,
        pipeline: Optional[list[tuple[str, dict]]] = None
    ):
        """
        Create single processor from registry

        For internal use by DAGExecutor

        Args:
            processor_name: Name the factory was registered under.
            config_dict: Configuration passed to the factory as its first
                positional argument.
            pipeline: Optional list of ``(name, params)`` tuples passed to the
                factory as its second positional argument; ``None`` is
                forwarded unchanged.

        Returns:
            Whatever the registered factory returns.

        Raises:
            Propagates whatever ``processor_registry.get`` raises for an
            unknown name.
        """
        factory = processor_registry.get(processor_name)
        return factory(config_dict, pipeline)

    @staticmethod
    def create_pipeline(config_path: str | Path, verbose: bool = False) -> DAGExecutor:
        """
        Create DAG pipeline from YAML config

        Supports:
        - Single node (just L2)
        - Multiple nodes (L1 → L2 → L3)
        - Complex DAGs with dependencies

        Example config:
            name: "my_pipeline"
            nodes:
              - id: "l2"
                processor: "l2_chain"
                inputs:
                  mentions: {source: "$input", fields: "mentions"}
                output: {key: "result"}
                config: {...}

        Args:
            config_path: Path of the YAML file describing the pipeline.
            verbose: Forwarded to ``DAGExecutor``.

        Returns:
            A ``DAGExecutor`` wrapping the pipeline described by the file.
        """
        # Delegate to the dict-based constructor so the file-based and
        # programmatic entry points cannot drift apart.
        return ProcessorFactory.create_from_dict(
            load_yaml(config_path), verbose=verbose
        )

    @staticmethod
    def create_from_dict(config_dict: dict, verbose: bool = False) -> DAGExecutor:
        """
        Create pipeline from dict (for programmatic use)

        Same as create_pipeline but accepts dict instead of file path

        Args:
            config_dict: Pipeline description. Requires the keys ``name`` and
                ``nodes``; ``description`` is optional. Every node requires
                ``id``, ``processor``, ``inputs``, ``output`` and ``config``;
                ``requires`` (default ``[]``) and ``schema`` (default
                ``None``) are optional.
            verbose: Forwarded to ``DAGExecutor``.

        Returns:
            A ``DAGExecutor`` wrapping the assembled ``DAGPipeline``.

        Raises:
            KeyError: If one of the required keys listed above is missing.
        """
        nodes = []
        for node_cfg in config_dict['nodes']:
            # Each input entry is a mapping of InputConfig keyword arguments.
            inputs = {
                name: InputConfig(**data)
                for name, data in node_cfg['inputs'].items()
            }

            node = PipeNode(
                id=node_cfg['id'],
                processor=node_cfg['processor'],
                inputs=inputs,
                output=OutputConfig(**node_cfg['output']),
                requires=node_cfg.get('requires', []),
                config=node_cfg['config'],
                schema=node_cfg.get('schema')
            )
            nodes.append(node)

        pipeline = DAGPipeline(
            name=config_dict['name'],
            description=config_dict.get('description'),
            nodes=nodes
        )

        return DAGExecutor(pipeline, verbose=verbose)

    @staticmethod
    def create_simple(
        model_name: str,
        device: str = "cpu",
        threshold: float = 0.5,
        template: str = "{label}",
        max_length: Optional[int] = 512,
        token: Optional[str] = None,
        entities: Optional[Union[str, Path, List[Dict[str, Any]], Dict[str, Dict[str, Any]]]] = None,
        precompute_embeddings: bool = False,
        verbose: bool = False,
        reranker_model: Optional[str] = None,
        reranker_max_labels: int = 20,
        reranker_threshold: Optional[float] = None,
    ) -> DAGExecutor:
        """
        Create a minimal L2 -> L3 -> L0 pipeline from a model name.

        Skips L1 (mention extraction). L2 serves as an in-memory entity store
        that returns all loaded entities as candidates. L3 runs GLiNER for
        entity linking. L0 aggregates in loose mode.

        Optionally adds an L4 reranker after L3 for chunked candidate
        re-evaluation when ``reranker_model`` is provided.

        Args:
            model_name: HuggingFace model ID or local path.
            device: Torch device ("cpu", "cuda", "cuda:0", ...).
            threshold: Minimum score for entity predictions.
            template: Format string for entity labels (e.g. "{label}: {description}").
            max_length: Max sequence length for tokenization.
            token: HuggingFace auth token for gated models.
            entities: Optional entity data to load immediately. Accepts a file
                path (str/Path to JSONL), a list of dicts, or a dict mapping
                entity_id to entity data.
            precompute_embeddings: If True and *entities* are provided,
                pre-embed all entity labels after loading (BiEncoder models only).
                Ignored when *entities* is None.
            verbose: Enable verbose logging.
            reranker_model: Optional GLiNER model for L4 reranking. When set,
                an L4 node is added after L3.
            reranker_max_labels: Max candidate labels per L4 inference call.
            reranker_threshold: Score threshold for L4. Defaults to *threshold*.

        Returns:
            Configured DAGExecutor ready for ``execute``.
        """
        nodes = [
            {
                "id": "l2",
                "processor": "l2_chain",
                "requires": [],
                "inputs": {
                    "texts": {"source": "$input", "fields": "texts"},
                },
                "output": {"key": "l2_result"},
                "schema": {"template": template},
                "config": {
                    "max_candidates": 30,
                    "min_popularity": 0,
                    "layers": [
                        {
                            "type": "dict",
                            "priority": 0,
                            "write": True,
                            "search_mode": ["exact"],
                        }
                    ],
                },
            },
            {
                "id": "l3",
                "processor": "l3_batch",
                "requires": ["l2"],
                "inputs": {
                    "texts": {"source": "$input", "fields": "texts"},
                    "candidates": {"source": "l2_result", "fields": "candidates"},
                },
                "output": {"key": "l3_result"},
                "schema": {"template": template},
                "config": {
                    "model_name": model_name,
                    "device": device,
                    "threshold": threshold,
                    "flat_ner": True,
                    "multi_label": False,
                    "use_precomputed_embeddings": True,
                    "cache_embeddings": False,
                    "max_length": max_length,
                    "token": token,
                },
            },
        ]

        # L0 consumes L3's entities unless a reranker is inserted, in which
        # case it consumes L4's entities instead.
        l0_entity_source = "l3_result"
        l0_requires = ["l2", "l3"]

        if reranker_model:
            # NOTE(review): L4 lists l3 in "requires" but reads its candidates
            # from l2_result, not l3_result - confirm this wiring is intended.
            nodes.append({
                "id": "l4",
                "processor": "l4_reranker",
                "requires": ["l2", "l3"],
                "inputs": {
                    "texts": {"source": "$input", "fields": "texts"},
                    "candidates": {"source": "l2_result", "fields": "candidates"},
                },
                "output": {"key": "l4_result"},
                "schema": {"template": template},
                "config": {
                    "model_name": reranker_model,
                    "device": device,
                    # An explicit 0.0 must be honoured, hence the None check
                    # rather than a truthiness test.
                    "threshold": reranker_threshold if reranker_threshold is not None else threshold,
                    "flat_ner": True,
                    "multi_label": False,
                    "max_labels": reranker_max_labels,
                    "max_length": max_length,
                    "token": token,
                },
            })
            l0_entity_source = "l4_result"
            l0_requires.append("l4")

        nodes.append({
            "id": "l0",
            "processor": "l0_aggregator",
            "requires": l0_requires,
            "inputs": {
                "l2_candidates": {"source": "l2_result", "fields": "candidates"},
                # The input keeps the name "l3_entities" even when the data
                # comes from the L4 reranker.
                "l3_entities": {"source": l0_entity_source, "fields": "entities"},
            },
            "output": {"key": "l0_result"},
            "schema": {"template": template},
            "config": {
                "strict_matching": False,
                "min_confidence": 0.0,
                "include_unlinked": True,
                "position_tolerance": 2,
            },
        })

        config = {
            "name": "simple",
            # NOTE(review): the description says "L3 only" although the
            # pipeline also contains L2 and L0 (and optionally L4) nodes.
            "description": "Simple pipeline - L3 only with entity database",
            "nodes": nodes,
        }
        executor = ProcessorFactory.create_from_dict(config, verbose=verbose)

        if entities is not None:
            executor.load_entities(entities)
            if precompute_embeddings:
                executor.precompute_embeddings()

        return executor
@@ -0,0 +1,31 @@
1
+ from typing import Dict, Callable
2
+
3
+
4
class ProcessorRegistry:
    """Lookup table mapping processor names to their factory callables."""

    def __init__(self):
        # name -> factory; populated through the ``register`` decorator.
        self._registry: Dict[str, Callable] = {}

    def register(self, name: str):
        """Return a decorator that stores the decorated factory under *name*.

        The factory itself is returned unchanged. Registering a name that is
        already present replaces the earlier entry.
        """
        def _store(factory: Callable):
            self._registry[name] = factory
            return factory

        return _store

    def get(self, name: str) -> Callable:
        """Return the factory registered under *name*.

        Raises:
            KeyError: If *name* is unknown; the message lists the registered
                names.
        """
        if name in self._registry:
            return self._registry[name]

        known = list(self._registry.keys())
        raise KeyError(
            f"Processor '{name}' not found. "
            f"Available: {known}"
        )

    def list_available(self) -> list[str]:
        """Return the registered processor names in insertion order."""
        return list(self._registry)


# Module-level registry instance shared by everything importing this module.
processor_registry = ProcessorRegistry()
glinker/l0/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """
2
+ L0 - Aggregation Layer
3
+
4
+ Combines outputs from L1 (mention extraction), L2 (candidate retrieval),
5
+ and L3 (entity linking) into unified L0Entity structures with full pipeline context.
6
+ """
7
+
8
+ from .models import L0Config, L0Input, L0Output, L0Entity, LinkedEntity
9
+ from .component import L0Component
10
+ from .processor import L0Processor, create_l0_processor
11
+
12
# Names re-exported as the public interface of the `glinker.l0` package.
__all__ = [
    "L0Config", "L0Input", "L0Output", "L0Entity", "LinkedEntity",
    "L0Component", "L0Processor", "create_l0_processor",
]