glinker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,547 @@
1
+ """
2
+ Pipeline configuration builder for easy setup.
3
+
4
+ ConfigBuilder: Unified builder with automatic defaults and full customization support.
5
+ """
6
+
7
+ from typing import List, Optional, Dict, Any, Literal
8
+ import yaml
9
+ from pathlib import Path
10
+
11
+
12
class ConfigBuilder:
    """
    Unified configuration builder for pipeline setup.

    Automatically creates simple configs with dict-based L2 by default.
    Supports full customization when needed.

    Simple usage (auto dict layer):
        builder = ConfigBuilder(name="demo")
        builder.l1.gliner(model="...", labels=[...])
        builder.l3.configure(model="...")
        config = builder.get_config()
        builder.save("config.yaml")

    Advanced usage (custom layers):
        builder = ConfigBuilder(name="production")
        builder.l1.gliner(model="...", labels=[...])
        builder.l2.add("redis", priority=2, ttl=3600)
        builder.l2.add("postgres", priority=0)
        builder.l2.embeddings(enabled=True)
        builder.l3.configure(model="...")
        builder.l0.configure(strict_matching=True)
        builder.save("config.yaml")
    """

    class L1Builder:
        """Builder for the L1 (mention detection / NER) node configuration."""

        def __init__(self, parent):
            # Owning ConfigBuilder; spacy()/gliner() write into its _l1_* slots.
            self.parent = parent

        def spacy(
            self,
            model: str = "en_core_sci_sm",
            device: str = "cpu",
            batch_size: int = 32,
            max_right_context: int = 50,
            max_left_context: int = 50,
            min_entity_length: int = 2,
            include_noun_chunks: bool = False
        ) -> "ConfigBuilder":
            """Configure L1 with spaCy NER.

            Args:
                model: spaCy model name.
                device: Inference device ("cpu", "cuda", ...).
                batch_size: Texts processed per batch.
                max_right_context: Right context window kept per mention
                    (units depend on the L1 processor -- TODO confirm).
                max_left_context: Left context window kept per mention.
                min_entity_length: Minimum mention length to keep.
                include_noun_chunks: Also emit noun chunks as mentions.

            Returns:
                The parent ConfigBuilder, for fluent chaining.
            """
            self.parent._l1_type = "l1_spacy"
            self.parent._l1_config = {
                "model": model,
                "device": device,
                "batch_size": batch_size,
                "max_right_context": max_right_context,
                "max_left_context": max_left_context,
                "min_entity_length": min_entity_length,
                "include_noun_chunks": include_noun_chunks
            }
            return self.parent

        def gliner(
            self,
            model: str,
            labels: List[str],
            token: Optional[str] = None,
            device: str = "cpu",
            threshold: float = 0.3,
            flat_ner: bool = True,
            multi_label: bool = False,
            batch_size: int = 16,
            max_right_context: int = 50,
            max_left_context: int = 50,
            min_entity_length: int = 2,
            use_precomputed_embeddings: bool = False,
            max_length: Optional[int] = 512
        ) -> "ConfigBuilder":
            """Configure L1 with GLiNER.

            Args:
                model: GLiNER model name or path.
                labels: Entity labels to detect.
                token: Optional HuggingFace access token.
                device: Inference device.
                threshold: Minimum prediction score to keep a mention.
                flat_ner: Disallow nested entity spans.
                multi_label: Allow multiple labels per span.
                batch_size: Texts processed per batch.
                max_right_context: Right context window kept per mention.
                max_left_context: Left context window kept per mention.
                min_entity_length: Minimum mention length to keep.
                use_precomputed_embeddings: Reuse precomputed label embeddings.
                max_length: Model max sequence length (None = model default).

            Returns:
                The parent ConfigBuilder, for fluent chaining.
            """
            self.parent._l1_type = "l1_gliner"
            self.parent._l1_config = {
                "model": model,
                "labels": labels,
                "token": token,
                "device": device,
                "threshold": threshold,
                "flat_ner": flat_ner,
                "multi_label": multi_label,
                "batch_size": batch_size,
                "max_right_context": max_right_context,
                "max_left_context": max_left_context,
                "min_entity_length": min_entity_length,
                "use_precomputed_embeddings": use_precomputed_embeddings,
                "max_length": max_length
            }
            return self.parent

    class L2Builder:
        """Builder for the L2 (candidate retrieval chain) node configuration."""

        def __init__(self, parent):
            # Owning ConfigBuilder; add()/embeddings() mutate its _l2_* slots.
            self.parent = parent

        def add(
            self,
            layer_type: Literal["dict", "redis", "elasticsearch", "postgres"],
            priority: int = 0,
            write: Optional[bool] = None,
            search_mode: Optional[List[str]] = None,
            ttl: Optional[int] = None,
            cache_policy: Optional[str] = None,
            fuzzy_similarity: Optional[float] = None,
            **db_config
        ) -> "ConfigBuilder":
            """
            Add a database layer to L2.

            Args:
                layer_type: Type of layer ("dict", "redis", "elasticsearch", "postgres")
                priority: Layer priority (higher = checked first)
                write: Whether to write to this layer (auto: True for cache, False for postgres)
                search_mode: List of search modes (auto: ["exact"] for redis, ["exact", "fuzzy"] for others)
                ttl: Cache TTL in seconds (auto: 0 for dict, 3600 for redis, 86400 for elasticsearch)
                cache_policy: "always", "miss", or "hit" (auto: "miss" for elasticsearch, "always" otherwise)
                fuzzy_similarity: Minimum similarity for fuzzy search
                    (auto: 0.75 for dict, 0.3 for elasticsearch/postgres; ignored for redis)
                **db_config: Database-specific config (host, port, database, etc.)

            Examples:
                builder.l2.add("dict")
                builder.l2.add("redis", priority=2, ttl=3600, host="localhost", port=6379)
                builder.l2.add("elasticsearch", priority=1, hosts=["http://localhost:9200"])
                builder.l2.add("postgres", priority=0, database="entities_db", user="postgres")
            """
            # Fill in per-layer-type defaults for anything the caller left as None.
            if write is None:
                write = layer_type != "postgres"  # Don't write to postgres by default

            if search_mode is None:
                search_mode = ["exact"] if layer_type == "redis" else ["exact", "fuzzy"]

            if ttl is None:
                ttl = {"dict": 0, "redis": 3600, "elasticsearch": 86400, "postgres": 0}.get(layer_type, 0)

            if cache_policy is None:
                cache_policy = "miss" if layer_type == "elasticsearch" else "always"

            # Common layer skeleton shared by every layer type.
            layer = {
                "type": layer_type,
                "priority": priority,
                "write": write,
                "search_mode": search_mode,
                "ttl": ttl,
                "cache_policy": cache_policy,
                "field_mapping": self._default_field_mapping()
            }

            # Database-specific connection config and fuzzy settings.
            # NOTE(review): for "redis" a passed fuzzy_similarity is silently
            # ignored (redis is exact-match only by default) -- confirm intended.
            if layer_type == "dict":
                if fuzzy_similarity is None:
                    fuzzy_similarity = 0.75
                layer["fuzzy"] = {
                    "max_distance": 64,
                    "min_similarity": fuzzy_similarity,
                    "n_gram_size": 3,
                    "prefix_length": 1
                }

            elif layer_type == "redis":
                layer["config"] = {
                    "host": db_config.get("host", "localhost"),
                    "port": db_config.get("port", 6379),
                    "db": db_config.get("db", 0)
                }

            elif layer_type == "elasticsearch":
                if fuzzy_similarity is None:
                    fuzzy_similarity = 0.3
                layer["config"] = {
                    "hosts": db_config.get("hosts", ["http://localhost:9200"]),
                    "index_name": db_config.get("index_name", "entities")
                }
                layer["fuzzy"] = {"min_similarity": fuzzy_similarity}

            elif layer_type == "postgres":
                if fuzzy_similarity is None:
                    fuzzy_similarity = 0.3
                layer["config"] = {
                    "host": db_config.get("host", "localhost"),
                    "port": db_config.get("port", 5432),
                    "database": db_config.get("database", "entities_db"),
                    "user": db_config.get("user", "postgres"),
                    "password": db_config.get("password", "postgres")
                }
                layer["fuzzy"] = {"min_similarity": fuzzy_similarity}

            self.parent._l2_layers.append(layer)
            return self.parent

        def embeddings(
            self,
            enabled: bool = True,
            model_name: str = "knowledgator/gliner-linker-large-v1.0",
            dim: int = 768,
            precompute_on_load: bool = False
        ) -> "ConfigBuilder":
            """Configure embeddings for L2 (BiEncoder support).

            Args:
                enabled: Turn embedding support on/off.
                model_name: Embedding model identifier.
                dim: Embedding dimensionality.
                precompute_on_load: Precompute embeddings when data is loaded.

            Returns:
                The parent ConfigBuilder, for fluent chaining.
            """
            self.parent._l2_embeddings = {
                "enabled": enabled,
                "model_name": model_name,
                "dim": dim,
                "precompute_on_load": precompute_on_load
            }

            # Retrofit embedding fields onto layers added before this call;
            # layers added afterwards get them via _default_field_mapping().
            for layer in self.parent._l2_layers:
                layer["field_mapping"]["embedding"] = "embedding"
                layer["field_mapping"]["embedding_model_id"] = "embedding_model_id"

            return self.parent

        def _default_field_mapping(self) -> Dict[str, str]:
            """Default entity field mapping; returns a fresh dict per layer."""
            mapping = {
                "entity_id": "entity_id",
                "label": "label",
                "aliases": "aliases",
                "description": "description",
                "entity_type": "entity_type",
                "popularity": "popularity"
            }

            # Add embedding fields if embeddings were already enabled.
            if self.parent._l2_embeddings:
                mapping["embedding"] = "embedding"
                mapping["embedding_model_id"] = "embedding_model_id"

            return mapping

    class L3Builder:
        """Builder for the L3 (entity disambiguation) node configuration."""

        def __init__(self, parent):
            # Owning ConfigBuilder; configure() writes its _l3_config slot.
            self.parent = parent

        def configure(
            self,
            model: str = "knowledgator/gliner-linker-large-v1.0",
            token: Optional[str] = None,
            device: str = "cpu",
            threshold: float = 0.5,
            flat_ner: bool = True,
            multi_label: bool = False,
            batch_size: int = 1,
            use_precomputed_embeddings: bool = False,
            cache_embeddings: bool = False,
            max_length: Optional[int] = 512
        ) -> "ConfigBuilder":
            """Configure L3 entity disambiguation.

            Args:
                model: Disambiguation model (stored as "model_name").
                token: Optional HuggingFace token (stored as "huggingface_token").
                device: Inference device.
                threshold: Minimum score to accept a link.
                flat_ner: Disallow nested spans.
                multi_label: Allow multiple labels per span.
                batch_size: Inference batch size.
                use_precomputed_embeddings: Reuse precomputed candidate embeddings.
                cache_embeddings: Cache embeddings across calls.
                max_length: Model max sequence length (None = model default).

            Returns:
                The parent ConfigBuilder, for fluent chaining.
            """
            self.parent._l3_config = {
                "model_name": model,
                "huggingface_token": token,
                "device": device,
                "threshold": threshold,
                "flat_ner": flat_ner,
                "multi_label": multi_label,
                "batch_size": batch_size,
                "use_precomputed_embeddings": use_precomputed_embeddings,
                "cache_embeddings": cache_embeddings,
                "max_length": max_length
            }
            return self.parent

    class L4Builder:
        """Builder for the optional L4 (GLiNER reranker with chunking) node."""

        def __init__(self, parent):
            # Owning ConfigBuilder; configure() writes its _l4_config slot.
            self.parent = parent

        def configure(
            self,
            model: str = "knowledgator/gliner-linker-large-v1.0",
            token: Optional[str] = None,
            device: str = "cpu",
            threshold: float = 0.5,
            flat_ner: bool = True,
            multi_label: bool = False,
            max_labels: int = 20,
            max_length: Optional[int] = 512
        ) -> "ConfigBuilder":
            """Configure L4 GLiNER reranker with candidate chunking.

            Args:
                model: GLiNER model (uni-encoder)
                token: Optional HuggingFace token.
                device: Inference device.
                threshold: Minimum score for entity predictions
                flat_ner: Disallow nested spans.
                multi_label: Allow multiple labels per span.
                max_labels: Maximum candidate labels per inference call.
                    Candidates exceeding this are split into chunks.
                max_length: Model max sequence length (None = model default).

            Returns:
                The parent ConfigBuilder, for fluent chaining.
            """
            self.parent._l4_config = {
                "model_name": model,
                "token": token,
                "device": device,
                "threshold": threshold,
                "flat_ner": flat_ner,
                "multi_label": multi_label,
                "max_labels": max_labels,
                "max_length": max_length
            }
            return self.parent

    class L0Builder:
        """Builder for the L0 (result aggregation) node configuration."""

        def __init__(self, parent):
            # Owning ConfigBuilder; configure() overwrites its _l0_config slot.
            self.parent = parent

        def configure(
            self,
            min_confidence: float = 0.0,
            include_unlinked: bool = True,
            return_all_candidates: bool = False,
            strict_matching: bool = True,
            position_tolerance: int = 2
        ) -> "ConfigBuilder":
            """Configure L0 aggregation parameters.

            Args:
                min_confidence: Drop links below this confidence.
                include_unlinked: Keep mentions with no linked entity.
                return_all_candidates: Include all candidates in the output.
                strict_matching: Require exact span matches when merging layers.
                position_tolerance: Allowed span-offset slack when not strict.

            Returns:
                The parent ConfigBuilder, for fluent chaining.
            """
            self.parent._l0_config = {
                "min_confidence": min_confidence,
                "include_unlinked": include_unlinked,
                "return_all_candidates": return_all_candidates,
                "strict_matching": strict_matching,
                "position_tolerance": position_tolerance
            }
            return self.parent

    def __init__(self, name: str = "pipeline", description: Optional[str] = None):
        """Create a builder.

        Args:
            name: Pipeline name placed in the generated config.
            description: Optional description; auto-generated from name if None.
        """
        self.name = name
        self.description = description or f"{name} - auto-generated configuration"
        self._l1_config: Optional[Dict[str, Any]] = None
        self._l1_type: Optional[str] = None
        self._l2_layers: List[Dict[str, Any]] = []
        self._l2_embeddings: Optional[Dict[str, Any]] = None
        self._l3_config: Optional[Dict[str, Any]] = None
        self._l4_config: Optional[Dict[str, Any]] = None
        # L0 has usable defaults, so it is pre-populated (unlike L1/L3).
        self._l0_config: Dict[str, Any] = {
            "min_confidence": 0.0,
            "include_unlinked": True,
            "return_all_candidates": False,
            "strict_matching": True,
            "position_tolerance": 2
        }
        self._schema_template = "{label}: {description}"

        # Initialize fluent sub-builders.
        self.l1 = self.L1Builder(self)
        self.l2 = self.L2Builder(self)
        self.l3 = self.L3Builder(self)
        self.l4 = self.L4Builder(self)
        self.l0 = self.L0Builder(self)

    def set_schema_template(self, template: str) -> "ConfigBuilder":
        """Set label formatting template for L2/L3/L0."""
        self._schema_template = template
        return self

    def get_config(self) -> Dict[str, Any]:
        """
        Get pipeline configuration as Python dictionary.

        Returns:
            dict: Complete pipeline configuration
        """
        return self.build()

    def build(self) -> Dict[str, Any]:
        """Build the pipeline configuration dictionary.

        Raises:
            ValueError: If L1 or L3 has not been configured.

        Returns:
            dict with "name", "description" and the ordered "nodes" list.
        """
        if not self._l1_type or not self._l1_config:
            raise ValueError("L1 configuration is required. Call builder.l1.spacy() or builder.l1.gliner() first.")

        if not self._l3_config:
            raise ValueError("L3 configuration is required. Call builder.l3.configure() first.")

        # Auto-add an in-memory dict layer if no L2 layers were specified.
        # NOTE(review): this auto layer uses min_similarity 0.6, while
        # l2.add("dict") defaults to 0.75 -- confirm the divergence is intended.
        if not self._l2_layers:
            self._l2_layers.append({
                "type": "dict",
                "priority": 0,
                "write": True,
                "search_mode": ["exact", "fuzzy"],
                "ttl": 0,
                "cache_policy": "always",
                "field_mapping": {
                    "entity_id": "entity_id",
                    "label": "label",
                    "aliases": "aliases",
                    "description": "description",
                    "entity_type": "entity_type",
                    "popularity": "popularity"
                },
                "fuzzy": {
                    "max_distance": 64,
                    "min_similarity": 0.6,
                    "n_gram_size": 3,
                    "prefix_length": 1
                }
            })

        # Build L2 config (more candidates when embeddings help rerank them).
        l2_config = {
            "max_candidates": 10 if self._l2_embeddings else 5,
            "min_popularity": 0,
            "layers": self._l2_layers
        }

        if self._l2_embeddings:
            l2_config["embeddings"] = self._l2_embeddings

        nodes = [
            # L1 Node: mention detection over raw input texts.
            {
                "id": "l1",
                "processor": self._l1_type,
                "inputs": {
                    "texts": {
                        "source": "$input",
                        "fields": "texts"
                    }
                },
                "output": {"key": "l1_result"},
                "config": self._l1_config
            },
            # L2 Node: candidate retrieval for the L1 mentions.
            {
                "id": "l2",
                "processor": "l2_chain",
                "requires": ["l1"],
                "inputs": {
                    "mentions": {
                        "source": "l1_result",
                        "fields": "entities"
                    }
                },
                "output": {"key": "l2_result"},
                "schema": {"template": self._schema_template},
                "config": l2_config
            },
            # L3 Node: disambiguation over texts + candidates + mentions.
            {
                "id": "l3",
                "processor": "l3_batch",
                "requires": ["l1", "l2"],
                "inputs": {
                    "texts": {
                        "source": "$input",
                        "fields": "texts"
                    },
                    "candidates": {
                        "source": "l2_result",
                        "fields": "candidates"
                    },
                    "l1_entities": {
                        "source": "l1_result",
                        "fields": "entities"
                    }
                },
                "output": {"key": "l3_result"},
                "schema": {"template": self._schema_template},
                "config": self._l3_config
            },
        ]

        # Determine which result L0 reads entity predictions from.
        l0_entity_source = "l3_result"
        l0_requires = ["l1", "l2", "l3"]

        # Optional L4 reranker node; if present, L0 reads from it instead of L3.
        if self._l4_config:
            nodes.append({
                "id": "l4",
                "processor": "l4_reranker",
                "requires": ["l1", "l2", "l3"],
                "inputs": {
                    "texts": {
                        "source": "$input",
                        "fields": "texts"
                    },
                    "candidates": {
                        "source": "l2_result",
                        "fields": "candidates"
                    },
                    "l1_entities": {
                        "source": "l1_result",
                        "fields": "entities"
                    }
                },
                "output": {"key": "l4_result"},
                "schema": {"template": self._schema_template},
                "config": self._l4_config
            })
            l0_entity_source = "l4_result"
            l0_requires.append("l4")

        # L0 Node: final aggregation of all layer outputs.
        nodes.append({
            "id": "l0",
            "processor": "l0_aggregator",
            "requires": l0_requires,
            "inputs": {
                "l1_entities": {
                    "source": "l1_result",
                    "fields": "entities"
                },
                "l2_candidates": {
                    "source": "l2_result",
                    "fields": "candidates"
                },
                "l3_entities": {
                    "source": l0_entity_source,
                    "fields": "entities"
                }
            },
            "output": {"key": "l0_result"},
            "config": self._l0_config,
            "schema": {"template": self._schema_template}
        })

        config = {
            "name": self.name,
            "description": self.description,
            "nodes": nodes
        }

        return config

    def save(self, filepath: str) -> None:
        """Save configuration to a YAML file, creating parent dirs as needed.

        Raises:
            ValueError: Propagated from build() if L1/L3 are unconfigured.
        """
        config = self.build()

        # Create directory if needed
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)

        # Explicit UTF-8: config values (and this module's own examples) may
        # contain non-ASCII text, which would fail under some locale encodings.
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(config, f, default_flow_style=False, sort_keys=False)

        print(f"✓ Configuration saved to {filepath}")