OntoLearner 1.4.10__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ontolearner-1.4.10 → ontolearner-1.5.0}/PKG-INFO +2 -2
- ontolearner-1.5.0/ontolearner/VERSION +1 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/base/learner.py +41 -18
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/evaluation/metrics.py +72 -32
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/__init__.py +3 -2
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/label_mapper.py +5 -4
- ontolearner-1.5.0/ontolearner/learner/llm.py +465 -0
- ontolearner-1.5.0/ontolearner/learner/prompt.py +66 -0
- ontolearner-1.5.0/ontolearner/learner/rag/__init__.py +14 -0
- {ontolearner-1.4.10/ontolearner/learner → ontolearner-1.5.0/ontolearner/learner/rag}/rag.py +7 -2
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/retriever/__init__.py +1 -1
- ontolearner-1.4.10/ontolearner/learner/retriever/llm_retriever.py → ontolearner-1.5.0/ontolearner/learner/retriever/augmented_retriever.py +48 -39
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/retriever/learner.py +3 -4
- ontolearner-1.5.0/ontolearner/learner/taxonomy_discovery/alexbek.py +822 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/taxonomy_discovery/skhnlp.py +216 -156
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/text2onto/__init__.py +1 -1
- ontolearner-1.5.0/ontolearner/learner/text2onto/alexbek.py +598 -0
- ontolearner-1.5.0/ontolearner/learner/text2onto/sbunlp.py +603 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/biology.py +2 -3
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/chemistry.py +16 -18
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/ecology_environment.py +2 -3
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/general.py +4 -6
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/material_science_engineering.py +64 -45
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/medicine.py +2 -3
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/scholarly_knowledge.py +6 -9
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/processor.py +3 -3
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/text2onto/splitter.py +69 -6
- {ontolearner-1.4.10 → ontolearner-1.5.0}/pyproject.toml +2 -2
- ontolearner-1.4.10/ontolearner/VERSION +0 -1
- ontolearner-1.4.10/ontolearner/learner/llm.py +0 -208
- ontolearner-1.4.10/ontolearner/learner/prompt.py +0 -31
- ontolearner-1.4.10/ontolearner/learner/taxonomy_discovery/alexbek.py +0 -500
- ontolearner-1.4.10/ontolearner/learner/text2onto/alexbek.py +0 -1219
- ontolearner-1.4.10/ontolearner/learner/text2onto/sbunlp.py +0 -598
- {ontolearner-1.4.10 → ontolearner-1.5.0}/LICENSE +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/README.md +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/images/logo.png +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/__init__.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/_learner.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/_ontology.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/base/__init__.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/base/ontology.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/base/text2onto.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/data_structure/__init__.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/data_structure/data.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/data_structure/metric.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/evaluation/__init__.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/evaluation/evaluate.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/retriever/crossencoder.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/retriever/embedding.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/retriever/ngram.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/taxonomy_discovery/__init__.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/taxonomy_discovery/rwthdbis.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/taxonomy_discovery/sbunlp.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/term_typing/__init__.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/term_typing/alexbek.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/term_typing/rwthdbis.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/learner/term_typing/sbunlp.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/__init__.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/agriculture.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/arts_humanities.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/education.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/events.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/finance.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/food_beverage.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/geography.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/industry.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/law.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/library_cultural_heritage.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/news_media.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/social_sciences.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/units_measurements.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/upper_ontologies.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/ontology/web.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/text2onto/__init__.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/text2onto/batchifier.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/text2onto/general.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/text2onto/synthesizer.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/tools/__init__.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/tools/analyzer.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/tools/visualizer.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/utils/__init__.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/utils/io.py +0 -0
- {ontolearner-1.4.10 → ontolearner-1.5.0}/ontolearner/utils/train_test_split.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OntoLearner
|
|
3
|
-
Version: 1.4.10
|
|
3
|
+
Version: 1.5.0
|
|
4
4
|
Summary: OntoLearner: A Modular Python Library for Ontology Learning with LLMs.
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.13
|
|
16
16
|
Requires-Dist: Levenshtein
|
|
17
|
-
Requires-Dist: bitsandbytes (>=0.45.1,<0.
|
|
17
|
+
Requires-Dist: bitsandbytes (>=0.45.1,<1.0.0) ; platform_system == "Linux"
|
|
18
18
|
Requires-Dist: dspy (>=2.6.14,<3.0.0)
|
|
19
19
|
Requires-Dist: g4f
|
|
20
20
|
Requires-Dist: gensim
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
1.5.0
|
|
@@ -18,6 +18,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
18
18
|
import torch
|
|
19
19
|
import torch.nn.functional as F
|
|
20
20
|
from sentence_transformers import SentenceTransformer
|
|
21
|
+
from collections import defaultdict
|
|
21
22
|
|
|
22
23
|
class AutoLearner(ABC):
|
|
23
24
|
"""
|
|
@@ -70,6 +71,7 @@ class AutoLearner(ABC):
|
|
|
70
71
|
- "term-typing": Predict semantic types for terms
|
|
71
72
|
- "taxonomy-discovery": Identify hierarchical relationships
|
|
72
73
|
- "non-taxonomy-discovery": Identify non-hierarchical relationships
|
|
74
|
+
- "text2onto" : Extract ontology terms and their semantic types from documents
|
|
73
75
|
|
|
74
76
|
Raises:
|
|
75
77
|
NotImplementedError: If not implemented by concrete class.
|
|
@@ -81,6 +83,8 @@ class AutoLearner(ABC):
|
|
|
81
83
|
self._taxonomy_discovery(train_data, test=False)
|
|
82
84
|
elif task == 'non-taxonomic-re':
|
|
83
85
|
self._non_taxonomic_re(train_data, test=False)
|
|
86
|
+
elif task == 'text2onto':
|
|
87
|
+
self._text2onto(train_data, test=False)
|
|
84
88
|
else:
|
|
85
89
|
raise ValueError(f"{task} is not a valid task.")
|
|
86
90
|
|
|
@@ -103,6 +107,7 @@ class AutoLearner(ABC):
|
|
|
103
107
|
- term-typing: List of predicted types for each term
|
|
104
108
|
- taxonomy-discovery: Boolean predictions for relationships
|
|
105
109
|
- non-taxonomy-discovery: Predicted relation types
|
|
110
|
+
- text2onto : Extract ontology terms and their semantic types from documents
|
|
106
111
|
|
|
107
112
|
Raises:
|
|
108
113
|
NotImplementedError: If not implemented by concrete class.
|
|
@@ -115,6 +120,8 @@ class AutoLearner(ABC):
|
|
|
115
120
|
return self._taxonomy_discovery(eval_data, test=True)
|
|
116
121
|
elif task == 'non-taxonomic-re':
|
|
117
122
|
return self._non_taxonomic_re(eval_data, test=True)
|
|
123
|
+
elif task == 'text2onto':
|
|
124
|
+
return self._text2onto(eval_data, test=True)
|
|
118
125
|
else:
|
|
119
126
|
raise ValueError(f"{task} is not a valid task.")
|
|
120
127
|
|
|
@@ -147,6 +154,9 @@ class AutoLearner(ABC):
|
|
|
147
154
|
def _non_taxonomic_re(self, data: Any, test: bool = False) -> Optional[Any]:
|
|
148
155
|
pass
|
|
149
156
|
|
|
157
|
+
def _text2onto(self, data: Any, test: bool = False) -> Optional[Any]:
|
|
158
|
+
pass
|
|
159
|
+
|
|
150
160
|
def tasks_data_former(self, data: Any, task: str, test: bool = False) -> List[str | Dict[str, str]]:
|
|
151
161
|
formatted_data = []
|
|
152
162
|
if task == "term-typing":
|
|
@@ -171,6 +181,7 @@ class AutoLearner(ABC):
|
|
|
171
181
|
non_taxonomic_types = list(set(non_taxonomic_types))
|
|
172
182
|
non_taxonomic_res = list(set(non_taxonomic_res))
|
|
173
183
|
formatted_data = {"types": non_taxonomic_types, "relations": non_taxonomic_res}
|
|
184
|
+
|
|
174
185
|
return formatted_data
|
|
175
186
|
|
|
176
187
|
def tasks_ground_truth_former(self, data: Any, task: str) -> List[Dict[str, str]]:
|
|
@@ -186,6 +197,26 @@ class AutoLearner(ABC):
|
|
|
186
197
|
formatted_data.append({"head": non_taxonomic_triplets.head,
|
|
187
198
|
"tail": non_taxonomic_triplets.tail,
|
|
188
199
|
"relation": non_taxonomic_triplets.relation})
|
|
200
|
+
if task == "text2onto":
|
|
201
|
+
terms2docs = data.get("terms2docs", {}) or {}
|
|
202
|
+
terms2types = data.get("terms2types", {}) or {}
|
|
203
|
+
|
|
204
|
+
# gold doc→terms
|
|
205
|
+
gold_terms = []
|
|
206
|
+
for term, doc_ids in terms2docs.items():
|
|
207
|
+
for doc_id in doc_ids or []:
|
|
208
|
+
gold_terms.append({"doc_id": doc_id, "term": term})
|
|
209
|
+
|
|
210
|
+
# gold doc→types derived via doc→terms + term→types
|
|
211
|
+
doc2types = defaultdict(set)
|
|
212
|
+
for term, doc_ids in terms2docs.items():
|
|
213
|
+
for doc_id in doc_ids or []:
|
|
214
|
+
for ty in (terms2types.get(term, []) or []):
|
|
215
|
+
if isinstance(ty, str) and ty.strip():
|
|
216
|
+
doc2types[doc_id].add(ty.strip())
|
|
217
|
+
gold_types = [{"doc_id": doc_id, "type": ty} for doc_id, tys in doc2types.items() for ty in tys]
|
|
218
|
+
return {"terms": gold_terms, "types": gold_types}
|
|
219
|
+
|
|
189
220
|
return formatted_data
|
|
190
221
|
|
|
191
222
|
class AutoLLM(ABC):
|
|
@@ -201,7 +232,7 @@ class AutoLLM(ABC):
|
|
|
201
232
|
tokenizer: The tokenizer associated with the model.
|
|
202
233
|
"""
|
|
203
234
|
|
|
204
|
-
def __init__(self, label_mapper: Any, device: str='cpu', token: str="") -> None:
|
|
235
|
+
def __init__(self, label_mapper: Any, device: str='cpu', token: str="", max_length: int = 512) -> None:
|
|
205
236
|
"""
|
|
206
237
|
Initialize the LLM component.
|
|
207
238
|
|
|
@@ -213,6 +244,7 @@ class AutoLLM(ABC):
|
|
|
213
244
|
self.device=device
|
|
214
245
|
self.model: Optional[Any] = None
|
|
215
246
|
self.tokenizer: Optional[Any] = None
|
|
247
|
+
self.max_length = max_length
|
|
216
248
|
|
|
217
249
|
|
|
218
250
|
def load(self, model_id: str) -> None:
|
|
@@ -236,10 +268,8 @@ class AutoLLM(ABC):
|
|
|
236
268
|
self.tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left', token=self.token)
|
|
237
269
|
self.tokenizer.pad_token = self.tokenizer.eos_token
|
|
238
270
|
if self.device == "cpu":
|
|
239
|
-
# device_map = "cpu"
|
|
240
271
|
self.model = AutoModelForCausalLM.from_pretrained(
|
|
241
272
|
model_id,
|
|
242
|
-
# device_map=device_map,
|
|
243
273
|
torch_dtype=torch.bfloat16,
|
|
244
274
|
token=self.token
|
|
245
275
|
)
|
|
@@ -248,11 +278,12 @@ class AutoLLM(ABC):
|
|
|
248
278
|
self.model = AutoModelForCausalLM.from_pretrained(
|
|
249
279
|
model_id,
|
|
250
280
|
device_map=device_map,
|
|
251
|
-
|
|
252
|
-
|
|
281
|
+
token=self.token,
|
|
282
|
+
trust_remote_code=True,
|
|
253
283
|
)
|
|
254
284
|
self.label_mapper.fit()
|
|
255
285
|
|
|
286
|
+
@torch.no_grad()
|
|
256
287
|
def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
|
|
257
288
|
"""
|
|
258
289
|
Generate text responses for the given input prompts.
|
|
@@ -276,29 +307,21 @@ class AutoLLM(ABC):
|
|
|
276
307
|
List of generated text responses, one for each input prompt.
|
|
277
308
|
Responses include the original input plus generated continuation.
|
|
278
309
|
"""
|
|
279
|
-
# Tokenize inputs and move to device
|
|
280
310
|
encoded_inputs = self.tokenizer(inputs,
|
|
281
311
|
return_tensors="pt",
|
|
282
|
-
|
|
283
|
-
truncation=True
|
|
312
|
+
max_length=self.max_length,
|
|
313
|
+
truncation=True,
|
|
314
|
+
padding=True).to(self.model.device)
|
|
284
315
|
input_ids = encoded_inputs["input_ids"]
|
|
285
316
|
input_length = input_ids.shape[1]
|
|
286
|
-
|
|
287
|
-
# Generate output
|
|
288
317
|
outputs = self.model.generate(
|
|
289
318
|
**encoded_inputs,
|
|
290
319
|
max_new_tokens=max_new_tokens,
|
|
291
|
-
pad_token_id=self.tokenizer.eos_token_id
|
|
320
|
+
pad_token_id=self.tokenizer.eos_token_id,
|
|
321
|
+
eos_token_id=self.tokenizer.eos_token_id
|
|
292
322
|
)
|
|
293
|
-
|
|
294
|
-
# Extract only the newly generated tokens (excluding prompt)
|
|
295
323
|
generated_tokens = outputs[:, input_length:]
|
|
296
|
-
|
|
297
|
-
# Decode only the generated part
|
|
298
324
|
decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
|
|
299
|
-
print(decoded_outputs)
|
|
300
|
-
print(self.label_mapper.predict(decoded_outputs))
|
|
301
|
-
# Map the decoded text to labels
|
|
302
325
|
return self.label_mapper.predict(decoded_outputs)
|
|
303
326
|
|
|
304
327
|
class AutoRetriever(ABC):
|
|
@@ -11,44 +11,84 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
from typing import List, Dict, Tuple, Set
|
|
14
|
+
from typing import List, Dict, Tuple, Set, Any, Union
|
|
15
15
|
|
|
16
16
|
SYMMETRIC_RELATIONS = {"equivalentclass", "sameas", "disjointwith"}
|
|
17
17
|
|
|
18
|
-
def text2onto_metrics(
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
18
|
+
def text2onto_metrics(
|
|
19
|
+
y_true: Dict[str, Any],
|
|
20
|
+
y_pred: Dict[str, Any],
|
|
21
|
+
similarity_threshold: float = 0.8
|
|
22
|
+
) -> Dict[str, Any]:
|
|
23
|
+
"""
|
|
24
|
+
Expects:
|
|
25
|
+
y_true = {"terms": [{"doc_id": str, "term": str}, ...],
|
|
26
|
+
"types": [{"doc_id": str, "type": str}, ...]}
|
|
27
|
+
y_pred = same shape
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
{"terms": {...}, "types": {...}}
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
def jaccard_similarity(text_a: str, text_b: str) -> float:
|
|
34
|
+
tokens_a = set(text_a.lower().split())
|
|
35
|
+
tokens_b = set(text_b.lower().split())
|
|
36
|
+
if not tokens_a and not tokens_b:
|
|
23
37
|
return 1.0
|
|
24
|
-
return len(
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
38
|
+
return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
|
|
39
|
+
|
|
40
|
+
def pairs_to_strings(rows: List[Dict[str, str]], value_key: str) -> List[str]:
|
|
41
|
+
paired_strings: List[str] = []
|
|
42
|
+
for row in rows or []:
|
|
43
|
+
doc_id = (row.get("doc_id") or "").strip()
|
|
44
|
+
value = (row.get(value_key) or "").strip()
|
|
45
|
+
if doc_id and value:
|
|
46
|
+
# keep doc association + allow token Jaccard
|
|
47
|
+
paired_strings.append(f"{doc_id} {value}")
|
|
48
|
+
return paired_strings
|
|
49
|
+
|
|
50
|
+
def score_list(ground_truth_items: List[str], predicted_items: List[str]) -> Dict[str, Union[float, int]]:
|
|
51
|
+
matched_ground_truth_indices: Set[int] = set()
|
|
52
|
+
matched_predicted_indices: Set[int] = set()
|
|
53
|
+
|
|
54
|
+
for predicted_index, predicted_item in enumerate(predicted_items):
|
|
55
|
+
for ground_truth_index, ground_truth_item in enumerate(ground_truth_items):
|
|
56
|
+
if ground_truth_index in matched_ground_truth_indices:
|
|
57
|
+
continue
|
|
58
|
+
|
|
59
|
+
if jaccard_similarity(predicted_item, ground_truth_item) >= similarity_threshold:
|
|
60
|
+
matched_predicted_indices.add(predicted_index)
|
|
61
|
+
matched_ground_truth_indices.add(ground_truth_index)
|
|
62
|
+
break
|
|
63
|
+
|
|
64
|
+
total_correct = len(matched_predicted_indices)
|
|
65
|
+
total_predicted = len(predicted_items)
|
|
66
|
+
total_ground_truth = len(ground_truth_items)
|
|
67
|
+
|
|
68
|
+
precision = total_correct / total_predicted if total_predicted else 0.0
|
|
69
|
+
recall = total_correct / total_ground_truth if total_ground_truth else 0.0
|
|
70
|
+
f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
|
|
71
|
+
|
|
72
|
+
return {
|
|
73
|
+
"f1_score": f1,
|
|
74
|
+
"precision": precision,
|
|
75
|
+
"recall": recall,
|
|
76
|
+
"total_correct": total_correct,
|
|
77
|
+
"total_predicted": total_predicted,
|
|
78
|
+
"total_ground_truth": total_ground_truth,
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
ground_truth_terms = pairs_to_strings(y_true.get("terms", []), "term")
|
|
82
|
+
predicted_terms = pairs_to_strings(y_pred.get("terms", []), "term")
|
|
83
|
+
ground_truth_types = pairs_to_strings(y_true.get("types", []), "type")
|
|
84
|
+
predicted_types = pairs_to_strings(y_pred.get("types", []), "type")
|
|
85
|
+
|
|
86
|
+
terms_metrics = score_list(ground_truth_terms, predicted_terms)
|
|
87
|
+
types_metrics = score_list(ground_truth_types, predicted_types)
|
|
41
88
|
|
|
42
|
-
precision = total_correct / total_predicted if total_predicted > 0 else 0
|
|
43
|
-
recall = total_correct / total_ground_truth if total_ground_truth > 0 else 0
|
|
44
|
-
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
|
|
45
89
|
return {
|
|
46
|
-
"
|
|
47
|
-
"
|
|
48
|
-
"recall": recall,
|
|
49
|
-
"total_correct": total_correct,
|
|
50
|
-
"total_predicted": total_predicted,
|
|
51
|
-
"total_ground_truth": total_ground_truth
|
|
90
|
+
"terms": terms_metrics,
|
|
91
|
+
"types": types_metrics,
|
|
52
92
|
}
|
|
53
93
|
|
|
54
94
|
def term_typing_metrics(y_true: List[Dict[str, List[str]]], y_pred: List[Dict[str, List[str]]]) -> Dict[str, float | int]:
|
|
@@ -12,8 +12,9 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from .llm import AutoLLMLearner, FalconLLM, MistralLLM
|
|
15
|
+
from .llm import AutoLLMLearner, FalconLLM, MistralLLM, LogitMistralLLM, \
|
|
16
|
+
QwenInstructLLM, QwenThinkingLLM, LogitAutoLLM, LogitQuantAutoLLM
|
|
16
17
|
from .retriever import AutoRetrieverLearner, LLMAugmentedRetrieverLearner
|
|
17
|
-
from .rag import AutoRAGLearner
|
|
18
|
+
from .rag import AutoRAGLearner, LLMAugmentedRAGLearner
|
|
18
19
|
from .prompt import StandardizedPrompting
|
|
19
20
|
from .label_mapper import LabelMapper
|
|
@@ -31,7 +31,7 @@ class LabelMapper:
|
|
|
31
31
|
ngram_range: Tuple=(1, 1),
|
|
32
32
|
label_dict: Dict[str, List[str]]=None,
|
|
33
33
|
analyzer: str = 'word',
|
|
34
|
-
iterator_no: int =
|
|
34
|
+
iterator_no: int = 1000):
|
|
35
35
|
"""
|
|
36
36
|
Initializes the TFIDFLabelMapper with a specified classifier and TF-IDF configuration.
|
|
37
37
|
|
|
@@ -45,11 +45,12 @@ class LabelMapper:
|
|
|
45
45
|
if label_dict is None:
|
|
46
46
|
label_dict = {
|
|
47
47
|
"yes": ["yes", "true"],
|
|
48
|
-
"no": ["no", "false"
|
|
48
|
+
"no": ["no", "false"]
|
|
49
49
|
}
|
|
50
|
-
self.
|
|
50
|
+
self.label_dict = label_dict
|
|
51
|
+
self.labels = [label.lower() for label in list(self.label_dict.keys())]
|
|
51
52
|
self.x_train, self.y_train = [], []
|
|
52
|
-
for label, candidates in label_dict.items():
|
|
53
|
+
for label, candidates in self.label_dict.items():
|
|
53
54
|
self.x_train += [label] + candidates
|
|
54
55
|
self.y_train += [label] * (len(candidates) + 1)
|
|
55
56
|
self.x_train = iterator_no * self.x_train
|