glinker 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glinker/__init__.py +54 -0
- glinker/core/__init__.py +56 -0
- glinker/core/base.py +103 -0
- glinker/core/builders.py +547 -0
- glinker/core/dag.py +898 -0
- glinker/core/factory.py +261 -0
- glinker/core/registry.py +31 -0
- glinker/l0/__init__.py +21 -0
- glinker/l0/component.py +472 -0
- glinker/l0/models.py +90 -0
- glinker/l0/processor.py +108 -0
- glinker/l1/__init__.py +15 -0
- glinker/l1/component.py +284 -0
- glinker/l1/models.py +47 -0
- glinker/l1/processor.py +152 -0
- glinker/l2/__init__.py +19 -0
- glinker/l2/component.py +1220 -0
- glinker/l2/models.py +99 -0
- glinker/l2/processor.py +170 -0
- glinker/l3/__init__.py +12 -0
- glinker/l3/component.py +184 -0
- glinker/l3/models.py +48 -0
- glinker/l3/processor.py +350 -0
- glinker/l4/__init__.py +9 -0
- glinker/l4/component.py +121 -0
- glinker/l4/models.py +21 -0
- glinker/l4/processor.py +156 -0
- glinker/py.typed +1 -0
- glinker-0.1.0.dist-info/METADATA +994 -0
- glinker-0.1.0.dist-info/RECORD +33 -0
- glinker-0.1.0.dist-info/WHEEL +5 -0
- glinker-0.1.0.dist-info/licenses/LICENSE +201 -0
- glinker-0.1.0.dist-info/top_level.txt +1 -0
glinker/core/factory.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Any, Dict, List, Optional, Union
|
|
3
|
+
import yaml
|
|
4
|
+
from .registry import processor_registry
|
|
5
|
+
from .dag import DAGPipeline, DAGExecutor, PipeNode, InputConfig, OutputConfig
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def load_yaml(path: str | Path) -> dict:
    """Load a YAML configuration file.

    Args:
        path: Path to the YAML file.

    Returns:
        The parsed YAML document as a dict.
    """
    # YAML streams are UTF-8 by spec; don't depend on the platform's
    # default encoding (which is cp125x on some Windows setups).
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ProcessorFactory:
    """Factory for creating pipelines from configs."""

    @staticmethod
    def create_from_registry(
        processor_name: str,
        config_dict: dict,
        pipeline: Optional[list[tuple[str, dict]]] = None
    ):
        """
        Create single processor from registry

        For internal use by DAGExecutor

        Args:
            processor_name: Name the processor factory was registered under.
            config_dict: Processor-specific configuration.
            pipeline: Optional (name, config) pairs forwarded to the factory.

        Raises:
            KeyError: If *processor_name* is not registered.
        """
        factory = processor_registry.get(processor_name)
        return factory(config_dict, pipeline)

    @staticmethod
    def _build_nodes(config: dict) -> list:
        """Build PipeNode objects from a config's ``nodes`` entries.

        Shared by :meth:`create_pipeline` and :meth:`create_from_dict` so the
        node-parsing logic lives in exactly one place.
        """
        nodes = []
        for node_cfg in config['nodes']:
            inputs = {
                name: InputConfig(**data)
                for name, data in node_cfg['inputs'].items()
            }
            nodes.append(PipeNode(
                id=node_cfg['id'],
                processor=node_cfg['processor'],
                inputs=inputs,
                output=OutputConfig(**node_cfg['output']),
                # 'requires' and 'schema' are optional in the config.
                requires=node_cfg.get('requires', []),
                config=node_cfg['config'],
                schema=node_cfg.get('schema'),
            ))
        return nodes

    @staticmethod
    def create_pipeline(config_path: str | Path, verbose: bool = False) -> DAGExecutor:
        """
        Create DAG pipeline from YAML config

        Supports:
        - Single node (just L2)
        - Multiple nodes (L1 → L2 → L3)
        - Complex DAGs with dependencies

        Example config:
            name: "my_pipeline"
            nodes:
              - id: "l2"
                processor: "l2_chain"
                inputs:
                  mentions: {source: "$input", fields: "mentions"}
                output: {key: "result"}
                config: {...}
        """
        # Identical to create_from_dict except for how the config is obtained.
        return ProcessorFactory.create_from_dict(load_yaml(config_path), verbose=verbose)

    @staticmethod
    def create_from_dict(config_dict: dict, verbose: bool = False) -> DAGExecutor:
        """
        Create pipeline from dict (for programmatic use)

        Same as create_pipeline but accepts dict instead of file path
        """
        pipeline = DAGPipeline(
            name=config_dict['name'],
            description=config_dict.get('description'),
            nodes=ProcessorFactory._build_nodes(config_dict),
        )
        return DAGExecutor(pipeline, verbose=verbose)

    @staticmethod
    def create_simple(
        model_name: str,
        device: str = "cpu",
        threshold: float = 0.5,
        template: str = "{label}",
        max_length: Optional[int] = 512,
        token: Optional[str] = None,
        entities: Optional[Union[str, Path, List[Dict[str, Any]], Dict[str, Dict[str, Any]]]] = None,
        precompute_embeddings: bool = False,
        verbose: bool = False,
        reranker_model: Optional[str] = None,
        reranker_max_labels: int = 20,
        reranker_threshold: Optional[float] = None,
    ) -> DAGExecutor:
        """
        Create a minimal L2 -> L3 -> L0 pipeline from a model name.

        Skips L1 (mention extraction). L2 serves as an in-memory entity store
        that returns all loaded entities as candidates. L3 runs GLiNER for
        entity linking. L0 aggregates in loose mode.

        Optionally adds an L4 reranker after L3 for chunked candidate
        re-evaluation when ``reranker_model`` is provided.

        Args:
            model_name: HuggingFace model ID or local path.
            device: Torch device ("cpu", "cuda", "cuda:0", ...).
            threshold: Minimum score for entity predictions.
            template: Format string for entity labels (e.g. "{label}: {description}").
            max_length: Max sequence length for tokenization.
            token: HuggingFace auth token for gated models.
            entities: Optional entity data to load immediately. Accepts a file
                path (str/Path to JSONL), a list of dicts, or a dict mapping
                entity_id to entity data.
            precompute_embeddings: If True and *entities* are provided,
                pre-embed all entity labels after loading (BiEncoder models only).
            verbose: Enable verbose logging.
            reranker_model: Optional GLiNER model for L4 reranking. When set,
                an L4 node is added after L3.
            reranker_max_labels: Max candidate labels per L4 inference call.
            reranker_threshold: Score threshold for L4. Defaults to *threshold*.

        Returns:
            Configured DAGExecutor ready for ``execute``.
        """
        nodes = [
            {
                "id": "l2",
                "processor": "l2_chain",
                "requires": [],
                "inputs": {
                    "texts": {"source": "$input", "fields": "texts"},
                },
                "output": {"key": "l2_result"},
                "schema": {"template": template},
                "config": {
                    "max_candidates": 30,
                    "min_popularity": 0,
                    # Single in-memory dict layer: exact-match lookup only.
                    "layers": [
                        {
                            "type": "dict",
                            "priority": 0,
                            "write": True,
                            "search_mode": ["exact"],
                        }
                    ],
                },
            },
            {
                "id": "l3",
                "processor": "l3_batch",
                "requires": ["l2"],
                "inputs": {
                    "texts": {"source": "$input", "fields": "texts"},
                    "candidates": {"source": "l2_result", "fields": "candidates"},
                },
                "output": {"key": "l3_result"},
                "schema": {"template": template},
                "config": {
                    "model_name": model_name,
                    "device": device,
                    "threshold": threshold,
                    "flat_ner": True,
                    "multi_label": False,
                    "use_precomputed_embeddings": True,
                    "cache_embeddings": False,
                    "max_length": max_length,
                    "token": token,
                },
            },
        ]

        # L0 consumes L3 output by default; an optional L4 reranker replaces it.
        l0_entity_source = "l3_result"
        l0_requires = ["l2", "l3"]

        if reranker_model:
            nodes.append({
                "id": "l4",
                "processor": "l4_reranker",
                "requires": ["l2", "l3"],
                "inputs": {
                    "texts": {"source": "$input", "fields": "texts"},
                    "candidates": {"source": "l2_result", "fields": "candidates"},
                },
                "output": {"key": "l4_result"},
                "schema": {"template": template},
                "config": {
                    "model_name": reranker_model,
                    "device": device,
                    # L4 threshold falls back to the L3 threshold when unset.
                    "threshold": reranker_threshold if reranker_threshold is not None else threshold,
                    "flat_ner": True,
                    "multi_label": False,
                    "max_labels": reranker_max_labels,
                    "max_length": max_length,
                    "token": token,
                },
            })
            l0_entity_source = "l4_result"
            l0_requires.append("l4")

        nodes.append({
            "id": "l0",
            "processor": "l0_aggregator",
            "requires": l0_requires,
            "inputs": {
                "l2_candidates": {"source": "l2_result", "fields": "candidates"},
                "l3_entities": {"source": l0_entity_source, "fields": "entities"},
            },
            "output": {"key": "l0_result"},
            "schema": {"template": template},
            "config": {
                # Loose aggregation: keep low-confidence and unlinked entities.
                "strict_matching": False,
                "min_confidence": 0.0,
                "include_unlinked": True,
                "position_tolerance": 2,
            },
        })

        config = {
            "name": "simple",
            # Description matches the actual topology (was: "L3 only").
            "description": "Simple pipeline - L2 entity store -> L3 linker -> L0 aggregator",
            "nodes": nodes,
        }
        executor = ProcessorFactory.create_from_dict(config, verbose=verbose)

        if entities is not None:
            executor.load_entities(entities)
            if precompute_embeddings:
                executor.precompute_embeddings()

        return executor
|
glinker/core/registry.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from typing import Dict, Callable
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class ProcessorRegistry:
    """Registry mapping processor names to their factory callables."""

    def __init__(self):
        # name -> factory callable, populated via @register(name)
        self._registry: Dict[str, Callable] = {}

    def register(self, name: str):
        """Return a decorator that records *factory* under *name*."""
        def decorator(factory: Callable):
            self._registry[name] = factory
            return factory
        return decorator

    def get(self, name: str) -> Callable:
        """Look up a factory; raise KeyError listing what is registered."""
        try:
            return self._registry[name]
        except KeyError:
            raise KeyError(
                f"Processor '{name}' not found. "
                f"Available: {list(self._registry.keys())}"
            ) from None

    def list_available(self) -> list[str]:
        """Return the names of every registered processor."""
        return [*self._registry]


processor_registry = ProcessorRegistry()
|
glinker/l0/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""
|
|
2
|
+
L0 - Aggregation Layer
|
|
3
|
+
|
|
4
|
+
Combines outputs from L1 (mention extraction), L2 (candidate retrieval),
|
|
5
|
+
and L3 (entity linking) into unified L0Entity structures with full pipeline context.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .models import L0Config, L0Input, L0Output, L0Entity, LinkedEntity
|
|
9
|
+
from .component import L0Component
|
|
10
|
+
from .processor import L0Processor, create_l0_processor
|
|
11
|
+
|
|
12
|
+
# Public API of the L0 aggregation layer: config/IO models, the component,
# and the processor plus its factory function.
__all__ = [
    "L0Config",
    "L0Input",
    "L0Output",
    "L0Entity",
    "LinkedEntity",
    "L0Component",
    "L0Processor",
    "create_l0_processor"
]
|