wikontic 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wikontic/__init__.py +16 -0
- wikontic/create_ontological_triplets_db.py +193 -0
- wikontic/create_triplets_db.py +259 -0
- wikontic/create_wikidata_ontology_db.py +555 -0
- wikontic/utils/__init__.py +7 -0
- wikontic/utils/base_inference_with_db.py +329 -0
- wikontic/utils/dynamic_aligner.py +281 -0
- wikontic/utils/inference_with_db.py +224 -0
- wikontic/utils/ontology_mappings/entity_hierarchy.json +1 -0
- wikontic/utils/ontology_mappings/entity_names.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2aliases.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2hierarchy.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2label.json +1 -0
- wikontic/utils/ontology_mappings/enum_entity_ids.json +1 -0
- wikontic/utils/ontology_mappings/enum_prop_ids.json +1 -0
- wikontic/utils/ontology_mappings/label2entity.json +1 -0
- wikontic/utils/ontology_mappings/obj_constraint2prop.json +1 -0
- wikontic/utils/ontology_mappings/prop2aliases.json +1 -0
- wikontic/utils/ontology_mappings/prop2constraints.json +1 -0
- wikontic/utils/ontology_mappings/prop2data_type.json +1 -0
- wikontic/utils/ontology_mappings/prop2label.json +1 -0
- wikontic/utils/ontology_mappings/propid2enum.json +1 -0
- wikontic/utils/ontology_mappings/subj_constraint2prop.json +1 -0
- wikontic/utils/ontology_mappings/subject_object_constraints.json +1 -0
- wikontic/utils/openai_utils.py +517 -0
- wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types.txt +17 -0
- wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types_dialog_bench.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_object_names.txt +17 -0
- wikontic/utils/prompts/name_refinement/rank_object_names_dialog_bench.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_object_qualifiers.txt +20 -0
- wikontic/utils/prompts/name_refinement/rank_subject_names.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_subject_names_dialog_bench.txt +20 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_entity_types.txt +26 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_relation.txt +24 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_relation_and_types.txt +28 -0
- wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question.txt +17 -0
- wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question_wo_types.txt +16 -0
- wikontic/utils/prompts/qa/prompt_entity_extraction_from_question.txt +3 -0
- wikontic/utils/prompts/qa/prompt_is_answered.txt +43 -0
- wikontic/utils/prompts/qa/qa_collapsing_prompt.txt +22 -0
- wikontic/utils/prompts/qa/qa_prompt.txt +5 -0
- wikontic/utils/prompts/qa/qa_prompt_hotpot.txt +6 -0
- wikontic/utils/prompts/qa/question_decomposition_1.txt +7 -0
- wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench.txt +75 -0
- wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench_in_russian.txt +78 -0
- wikontic/utils/prompts/triplet_extraction/propmt_1_types_qualifiers.txt +91 -0
- wikontic/utils/structured_aligner.py +606 -0
- wikontic/utils/structured_inference_with_db.py +561 -0
- wikontic-0.0.3.dist-info/METADATA +111 -0
- wikontic-0.0.3.dist-info/RECORD +53 -0
- wikontic-0.0.3.dist-info/WHEEL +5 -0
- wikontic-0.0.3.dist-info/licenses/LICENSE +19 -0
- wikontic-0.0.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,561 @@

```python
from unidecode import unidecode
import re
import warnings
from typing import Dict, List, Tuple
from langchain.tools import tool
import logging

from .base_inference_with_db import BaseInferenceWithDB

warnings.filterwarnings("ignore")
logger = logging.getLogger("StructuredInferenceWithDB")
logger.setLevel(logging.ERROR)


class StructuredInferenceWithDB(BaseInferenceWithDB):
    def __init__(self, extractor, aligner, triplets_db):
        self.extractor = extractor
        self.aligner = aligner
        self.triplets_db = triplets_db

        self.extract_triplets_with_ontology_filtering_tool = tool(
            self.extract_triplets_with_ontology_filtering
        )
        self.extract_triplets_with_ontology_filtering_and_add_to_db_tool = tool(
            self.extract_triplets_with_ontology_filtering_and_add_to_db
        )
        self.retrieve_similar_entity_names_tool = tool(
            self.retrieve_similar_entity_names
        )
        self.identify_relevant_entities_from_question_tool = tool(
            self.identify_relevant_entities_from_question_with_llm
        )
        self.get_1_hop_supporting_triplets_tool = tool(
            self.get_1_hop_supporting_triplets
        )
```
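The constructor wraps several bound methods with LangChain's `tool` helper so they can be handed to an agent as callable tools. A minimal sketch of what that wrapping does, using a hypothetical standalone function that is not part of wikontic (exact tool attributes depend on the installed langchain version):

```python
from langchain.tools import tool

def add_numbers(a: int, b: int) -> int:
    """Add two integers."""  # `tool` uses the docstring as the tool description
    return a + b

add_numbers_tool = tool(add_numbers)
print(add_numbers_tool.name)                      # "add_numbers"
print(add_numbers_tool.invoke({"a": 2, "b": 3}))  # 5
```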
```python
    # 1st step: extraction without the database

    def _refine_entity_types(self, text, triplet):
        """
        Refine entity types using LLM.
        """
        candidate_subj_type_ids, candidate_obj_type_ids = (
            self.aligner.retrieve_similar_entity_types(triplet=triplet)
        )

        candidate_entity_type_id_2_label = self.aligner.retrieve_entity_type_labels(
            candidate_subj_type_ids + candidate_obj_type_ids
        )

        candidate_entity_type_label_2_id = {
            entity_label: entity_id
            for entity_id, entity_label in candidate_entity_type_id_2_label.items()
        }

        candidate_subject_types = [
            candidate_entity_type_id_2_label[t] for t in candidate_subj_type_ids
        ]
        candidate_object_types = [
            candidate_entity_type_id_2_label[t] for t in candidate_obj_type_ids
        ]

        # no need to refine if both of the triplet's types are already
        # among the candidate types
        if (
            triplet["subject_type"] in candidate_subject_types
            and triplet["object_type"] in candidate_object_types
        ):
            refined_subject_type, refined_object_type = (
                triplet["subject_type"],
                triplet["object_type"],
            )
            refined_subject_type_id = candidate_entity_type_label_2_id[
                triplet["subject_type"]
            ]
            refined_object_type_id = candidate_entity_type_label_2_id[
                triplet["object_type"]
            ]

        else:
            # if the triplet's subject type is already among the candidates,
            # pin it so that effectively only the object type gets refined
            if triplet["subject_type"] in candidate_subject_types:
                candidate_subject_types = [triplet["subject_type"]]
            # likewise, pin the object type if it is already among the candidates
            if triplet["object_type"] in candidate_object_types:
                candidate_object_types = [triplet["object_type"]]

            self.extractor.reset_error_state()
            refined_entity_types = self.extractor.refine_entity_types(
                text=text,
                triplet=triplet,
                candidate_subject_types=candidate_subject_types,
                candidate_object_types=candidate_object_types,
            )
            refined_subject_type, refined_object_type = (
                refined_entity_types["subject_type"],
                refined_entity_types["object_type"],
            )

            refined_subject_type_id = (
                candidate_entity_type_label_2_id[refined_subject_type]
                if refined_subject_type in candidate_subject_types
                else None
            )

            refined_object_type_id = (
                candidate_entity_type_label_2_id[refined_object_type]
                if refined_object_type in candidate_object_types
                else None
            )

        return (
            refined_subject_type,
            refined_subject_type_id,
            refined_object_type,
            refined_object_type_id,
        )
```
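For orientation, here is the triplet shape this method consumes and what it returns; the field values are made-up examples inferred from the keys accessed above, not taken from package documentation:

```python
# Hypothetical input, matching the keys accessed in _refine_entity_types:
triplet = {
    "subject": "Ada Lovelace",
    "subject_type": "human",
    "relation": "field of work",
    "object": "mathematics",
    "object_type": "academic discipline",
    "qualifiers": [],
}
# Returns (refined_subject_type, refined_subject_type_id,
#          refined_object_type, refined_object_type_id);
# an *_id is None when the LLM picks a type outside the candidate list.
```

Note that inverting `candidate_entity_type_id_2_label` into `candidate_entity_type_label_2_id` silently assumes candidate type labels are unique; duplicate labels would collapse to the last id.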
```python
    def _get_candidate_entity_properties(
        self, triplet: Dict[str, str], subj_type_ids: List[str], obj_type_ids: List[str]
    ) -> Tuple[List[Tuple[str, str]], Dict[str, dict]]:
        """
        Retrieve candidate properties and their labels/constraints.
        """
        # Get the list of tuples (<property_id>, <property_direction>)
        properties: List[Tuple[str, str]] = (
            self.aligner.retrieve_properties_for_entity_type(
                target_relation=triplet["relation"],
                object_types=obj_type_ids,
                subject_types=subj_type_ids,
                k=10,
            )
        )
        # Get dict {<prop_id>:
        #           {"label": <prop_label>,
        #            "valid_subject_type_ids": <valid_subject_type_ids>,
        #            "valid_object_type_ids": <valid_object_type_ids>}}
        prop_2_label_and_constraint = (
            self.aligner.retrieve_properties_labels_and_constraints(
                property_id_list=[p[0] for p in properties]
            )
        )
        return properties, prop_2_label_and_constraint
```
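The two return values have the shapes sketched below; the ids and labels are illustrative Wikidata-style examples, not guaranteed package output:

```python
# (property_id, direction) pairs, as described in the comment above:
properties = [("P101", "direct"), ("P101", "inverse")]
# per-property labels and type constraints; "ANY" acts as a wildcard
# that _validate_backbone later expands to the entity's own hierarchy:
prop_2_label_and_constraint = {
    "P101": {
        "label": "field of work",
        "valid_subject_type_ids": ["ANY"],
        "valid_object_type_ids": ["Q11862829"],
    }
}
```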
```python
    def _refine_relation(
        self, text, triplet, refined_subject_type_id, refined_object_type_id
    ):
        """
        Refine relation using LLM.
        """
        # if refined subject and object types are in the candidate types,
        # then refine the relation
        if refined_subject_type_id and refined_object_type_id:
            relation_direction_candidate_pairs, prop_2_label_and_constraint = (
                self._get_candidate_entity_properties(
                    triplet=triplet,
                    subj_type_ids=[refined_subject_type_id],
                    obj_type_ids=[refined_object_type_id],
                )
            )
            candidate_relations = [
                prop_2_label_and_constraint[p[0]]["label"]
                for p in relation_direction_candidate_pairs
            ]
            # no need to refine
            # if the triplet's relation is in the candidate relations
            if triplet["relation"] in candidate_relations:
                refined_relation = triplet["relation"]
            else:
                self.extractor.reset_error_state()
                refined_relation = self.extractor.refine_relation(
                    text=text, triplet=triplet, candidate_relations=candidate_relations
                )["relation"]
        # if refined subject and object types are not in the candidate types,
        # leave relation as it is
        else:
            refined_relation = triplet["relation"]
            candidate_relations = []

        # if refined relation is in the candidate relations,
        # then identify the relation direction
        if refined_relation in candidate_relations:
            refined_relation_id_candidates = [
                p_id
                for p_id in prop_2_label_and_constraint
                if prop_2_label_and_constraint[p_id]["label"] == refined_relation
            ]
            refined_relation_id = refined_relation_id_candidates[0]
            refined_relation_directions = [
                p[1]
                for p in relation_direction_candidate_pairs
                if p[0] == refined_relation_id
            ]
            refined_relation_direction = (
                "direct" if "direct" in refined_relation_directions else "inverse"
            )

            prop_subject_type_ids = [
                prop_2_label_and_constraint[prop]["valid_subject_type_ids"]
                for prop in prop_2_label_and_constraint
                if prop_2_label_and_constraint[prop]["label"] == refined_relation
            ][0]
            prop_object_type_ids = [
                prop_2_label_and_constraint[prop]["valid_object_type_ids"]
                for prop in prop_2_label_and_constraint
                if prop_2_label_and_constraint[prop]["label"] == refined_relation
            ][0]

        else:
            refined_relation_direction = "direct"
            refined_relation_id = None
            prop_subject_type_ids = []
            prop_object_type_ids = []

        return (
            refined_relation,
            refined_relation_id,
            refined_relation_direction,
            prop_subject_type_ids,
            prop_object_type_ids,
        )
```
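Two details above are easy to miss: the first matching property id wins (`refined_relation_id_candidates[0]`), and the direction defaults to "direct" whenever any matching pair is direct. A toy illustration of the direction rule, with a hypothetical property id:

```python
# Toy example of the direction resolution in _refine_relation:
relation_direction_candidate_pairs = [("P50", "inverse"), ("P50", "direct")]
refined_relation_id = "P50"
directions = [
    d for pid, d in relation_direction_candidate_pairs if pid == refined_relation_id
]
assert ("direct" if "direct" in directions else "inverse") == "direct"
```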
```python
    def _validate_backbone(
        self,
        refined_subject_type: str,
        refined_object_type: str,
        refined_relation: str,
        refined_object_type_id: str,
        refined_subject_type_id: str,
        refined_relation_id: str,
        valid_subject_type_ids: List[str],
        valid_object_type_ids: List[str],
    ):
        """
        Check if the selected backbone_triplet's types and relation are in the valid sets.
        """
        exception_msg = ""
        if not refined_relation_id:
            exception_msg += "Refined relation not in candidate relations\n"
        if not refined_subject_type_id:
            exception_msg += "Refined subject type not in candidate subject types\n"
        if not refined_object_type_id:
            exception_msg += "Refined object type not in candidate object types\n"

        if exception_msg != "":
            return False, exception_msg

        else:
            subject_type_hierarchy = self.aligner.retrieve_entity_type_hierarchy(
                refined_subject_type
            )
            object_type_hierarchy = self.aligner.retrieve_entity_type_hierarchy(
                refined_object_type
            )

            if valid_subject_type_ids == ["ANY"]:
                valid_subject_type_ids = subject_type_hierarchy
            if valid_object_type_ids == ["ANY"]:
                valid_object_type_ids = object_type_hierarchy

            if any(
                [t in subject_type_hierarchy for t in valid_subject_type_ids]
            ) and any([t in object_type_hierarchy for t in valid_object_type_ids]):
                return True, exception_msg
            else:
                exception_msg += "Triplet backbone violates property constraints\n"
                return False, exception_msg
```
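In other words, a backbone passes validation when the subject's and object's type hierarchies each intersect the property's valid type sets, with `["ANY"]` acting as a wildcard. A toy check under that reading (the Q-ids are standard Wikidata items used purely for illustration):

```python
# human -> person -> entity, written as Wikidata ids for illustration
subject_type_hierarchy = ["Q5", "Q215627", "Q35120"]
valid_subject_type_ids = ["Q215627"]  # property allows "person" subjects
assert any(t in subject_type_hierarchy for t in valid_subject_type_ids)
```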
```python
    def _refine_entity_name(self, text, triplet, sample_id, is_object=False):
        """
        Refine entity names using type constraints.
        """
        self.extractor.reset_error_state()
        if is_object:
            entity = unidecode(triplet["object"])
            entity_type = triplet["object_type"]
            entity_hierarchy = self.aligner.retrieve_entity_type_hierarchy(entity_type)
        else:
            entity = unidecode(triplet["subject"])
            entity_type = triplet["subject_type"]
            entity_hierarchy = []

        # do not change time or quantity entities (the hierarchy is only
        # looked up for objects, so this guard applies to objects only)
        if any([t in ["Q186408", "Q309314"] for t in entity_hierarchy]):
            updated_entity = entity
        else:
            # otherwise retrieve similar entities by type and name similarity
            similar_entities = self.aligner.retrieve_entity_by_type(
                entity_name=entity, entity_type=entity_type, sample_id=sample_id
            )
            # if there are similar entities -> refine the entity name
            if len(similar_entities) > 0:
                # if an exact match is found -> return the exact match
                if entity in similar_entities:
                    updated_entity = similar_entities[entity]
                else:
                    # no exact match -> let the LLM pick among the candidates
                    updated_entity = self.extractor.refine_entity(
                        text=text,
                        triplet=triplet,
                        candidates=list(similar_entities.values()),
                        is_object=is_object,
                    )
                    # unidecode the updated entity
                    updated_entity = unidecode(updated_entity)
                    # if the answer is "None" (the LLM picked none of the
                    # candidates) -> keep the original entity
                    if re.sub(r"[^\w\s]", "", updated_entity) == "None":
                        updated_entity = entity
            else:
                # no similar entities -> keep the original entity
                updated_entity = entity

        self.aligner.add_entity(
            entity_name=updated_entity,
            alias=entity,
            entity_type=entity_type,
            sample_id=sample_id,
        )

        return updated_entity
```
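The `"None"` guard above strips punctuation before comparing, so variants the LLM might emit still fall back to the original surface form:

```python
import re

# each of these is treated as "no match found" by _refine_entity_name
for answer in ["None", "None.", '"None"']:
    assert re.sub(r"[^\w\s]", "", answer) == "None"
```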
```python
    def extract_triplets_with_ontology_filtering(
        self, text, sample_id=None, source_text_id=None
    ):
        """
        Extract and refine knowledge graph triplets from text using LLM.

        Args:
            text (str): Input text to extract triplets from
            sample_id (str): Sample ID - used to distinguish graphs produced
                in different runs/by different users
            source_text_id (str): Optional - used to distinguish texts from
                different sources (for example, different paragraphs of the same text)
        Returns:
            tuple:
                (initial_triplets, final_triplets, filtered_triplets, ontology_filtered_triplets)
        """
        self.extractor.reset_tokens()
        self.extractor.reset_messages()
        self.extractor.reset_error_state()

        extracted_triplets = self.extractor.extract_triplets_from_text(text)

        initial_triplets = []
        for triplet in extracted_triplets["triplets"]:
            triplet["prompt_token_num"], triplet["completion_token_num"] = (
                self.extractor.calculate_used_tokens()
            )
            triplet["source_text_id"] = source_text_id
            triplet["sample_id"] = sample_id
            initial_triplets.append(triplet.copy())

        final_triplets = []
        filtered_triplets = []
        ontology_filtered_triplets = []

        for triplet in extracted_triplets["triplets"]:
            self.extractor.reset_tokens()
            try:
                logger.log(logging.DEBUG, "Triplet: %s\n%s" % (str(triplet), "-" * 100))

                # _____________ Refine entity types __________

                (
                    refined_subject_type,
                    refined_subject_type_id,
                    refined_object_type,
                    refined_object_type_id,
                ) = self._refine_entity_types(text=text, triplet=triplet)

                # ________________ Refine relation ________________
                (
                    refined_relation,
                    refined_relation_id,
                    refined_relation_direction,
                    prop_subject_type_ids,
                    prop_object_type_ids,
                ) = self._refine_relation(
                    text=text,
                    triplet=triplet,
                    refined_subject_type_id=refined_subject_type_id,
                    refined_object_type_id=refined_object_type_id,
                )

                if refined_relation_direction == "inverse":
                    refined_subject_type_id, refined_object_type_id = (
                        refined_object_type_id,
                        refined_subject_type_id,
                    )

                # __________ Refine entity names ___________
                backbone_triplet = {
                    "subject": (
                        triplet["subject"]
                        if refined_relation_direction == "direct"
                        else triplet["object"]
                    ),
                    "relation": refined_relation,
                    "object": (
                        triplet["object"]
                        if refined_relation_direction == "direct"
                        else triplet["subject"]
                    ),
                    "subject_type": (
                        refined_subject_type
                        if refined_relation_direction == "direct"
                        else refined_object_type
                    ),
                    "object_type": (
                        refined_object_type
                        if refined_relation_direction == "direct"
                        else refined_subject_type
                    ),
                }

                backbone_triplet["qualifiers"] = triplet["qualifiers"]

                # log the names before refinement, so the "Original" labels
                # below actually show the pre-refinement values
                logger.log(
                    logging.DEBUG,
                    "Original subject name: %s\n%s"
                    % (str(backbone_triplet["subject"]), "-" * 100),
                )
                logger.log(
                    logging.DEBUG,
                    "Original object name: %s\n%s"
                    % (str(backbone_triplet["object"]), "-" * 100),
                )

                if refined_subject_type_id:
                    backbone_triplet["subject"] = self._refine_entity_name(
                        text, backbone_triplet, sample_id, is_object=False
                    )

                if refined_object_type_id:
                    backbone_triplet["object"] = self._refine_entity_name(
                        text, backbone_triplet, sample_id, is_object=True
                    )

                logger.log(
                    logging.DEBUG,
                    "Refined subject name: %s\n%s"
                    % (str(backbone_triplet["subject"]), "-" * 100),
                )
                logger.log(
                    logging.DEBUG,
                    "Refined object name: %s\n%s"
                    % (str(backbone_triplet["object"]), "-" * 100),
                )

                (
                    backbone_triplet["prompt_token_num"],
                    backbone_triplet["completion_token_num"],
                ) = self.extractor.calculate_used_tokens()
                backbone_triplet["source_text_id"] = source_text_id
                backbone_triplet["sample_id"] = sample_id

                # ___________________________ Validate backbone triplet ___________________________
                backbone_triplet_valid, backbone_triplet_exception_msg = (
                    self._validate_backbone(
                        backbone_triplet["subject_type"],
                        backbone_triplet["object_type"],
                        backbone_triplet["relation"],
                        refined_object_type_id,
                        refined_subject_type_id,
                        refined_relation_id,
                        prop_subject_type_ids,
                        prop_object_type_ids,
                    )
                )

                if backbone_triplet_valid:
                    final_triplets.append(backbone_triplet.copy())
                    logger.log(
                        logging.DEBUG,
                        "Final triplet: %s\n%s" % (str(backbone_triplet), "-" * 100),
                    )
                else:
                    logger.log(
                        logging.ERROR,
                        "Final triplet is ontology filtered: %s\n%s"
                        % (str(backbone_triplet), "-" * 100),
                    )
                    logger.log(
                        logging.ERROR,
                        "Exception: %s" % (str(backbone_triplet_exception_msg)),
                    )
                    logger.log(
                        logging.ERROR, "Refined relation: %s" % (str(refined_relation))
                    )
                    logger.log(
                        logging.ERROR,
                        "Refined subject type: %s" % (str(refined_subject_type)),
                    )
                    logger.log(
                        logging.ERROR,
                        "Refined object type: %s" % (str(refined_object_type)),
                    )

                    backbone_triplet["exception_text"] = backbone_triplet_exception_msg
                    ontology_filtered_triplets.append(backbone_triplet.copy())

            except Exception as e:
                backbone_triplet = triplet.copy()
                (
                    backbone_triplet["prompt_token_num"],
                    backbone_triplet["completion_token_num"],
                ) = self.extractor.calculate_used_tokens()
                backbone_triplet["source_text_id"] = source_text_id
                backbone_triplet["sample_id"] = sample_id
                backbone_triplet["exception_text"] = str(e)
                filtered_triplets.append(backbone_triplet.copy())
                logger.log(
                    logging.INFO,
                    "Filtered triplet: %s\n%s" % (str(backbone_triplet), "-" * 100),
                )
                logger.log(logging.INFO, "Exception: %s" % (str(e)))

        return (
            initial_triplets,
            final_triplets,
            filtered_triplets,
            ontology_filtered_triplets,
        )
```
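A minimal usage sketch. The module and class names come from the wheel's own file list; the extractor/aligner/database setup is elided, since constructing those follows the package's other modules (`openai_utils.py`, `structured_aligner.py`, `create_triplets_db.py`) which are not reproduced in this hunk:

```python
from wikontic.utils.structured_inference_with_db import StructuredInferenceWithDB

# extractor   = LLMTripletExtractor(...)  # see wikontic/utils/openai_utils.py
# aligner     = Aligner(...)              # see wikontic/utils/structured_aligner.py
# triplets_db = ...                       # see wikontic/create_triplets_db.py

pipeline = StructuredInferenceWithDB(extractor, aligner, triplets_db)
initial, final, filtered, ontology_filtered = (
    pipeline.extract_triplets_with_ontology_filtering(
        text="Marie Curie discovered polonium.",
        sample_id="demo-run",
        source_text_id="paragraph-0",
    )
)
```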
```python
    def extract_triplets_with_ontology_filtering_and_add_to_db(
        self, text, sample_id=None, source_text_id=None
    ):
        """
        Extract and refine knowledge graph triplets from text using LLM, then add them to the database.

        Args:
            text (str): Input text to extract triplets from
            sample_id (str): Sample ID - used to distinguish graphs produced in different runs/by different users
            source_text_id (str): Optional - used to distinguish texts from different sources (e.g., different paragraphs of the same text)
        Returns:
            tuple: (initial_triplets, final_triplets, filtered_triplets, ontology_filtered_triplets)
        """
        (
            initial_triplets,
            final_triplets,
            filtered_triplets,
            ontology_filtered_triplets,
        ) = self.extract_triplets_with_ontology_filtering(
            text, sample_id=sample_id, source_text_id=source_text_id
        )
        if len(initial_triplets) > 0:
            self.aligner.add_initial_triplets(initial_triplets, sample_id=sample_id)
        if len(final_triplets) > 0:
            self.aligner.add_triplets(final_triplets, sample_id=sample_id)
        if len(filtered_triplets) > 0:
            self.aligner.add_filtered_triplets(filtered_triplets, sample_id=sample_id)
        if len(ontology_filtered_triplets) > 0:
            self.aligner.add_ontology_filtered_triplets(
                ontology_filtered_triplets, sample_id=sample_id
            )
        return (
            initial_triplets,
            final_triplets,
            filtered_triplets,
            ontology_filtered_triplets,
        )
```
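The persisting variant has the same signature and return value; assuming the hypothetical `pipeline` object from the sketch above, the only difference is that each bucket of triplets is also written through the aligner's stores:

```python
# Same extraction, but results are persisted via the aligner
# (sketch; reuses the hypothetical `pipeline` from above).
initial, final, filtered, ontology_filtered = (
    pipeline.extract_triplets_with_ontology_filtering_and_add_to_db(
        text="Marie Curie discovered polonium.",
        sample_id="demo-run",
    )
)
```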
@@ -0,0 +1,111 @@

```
Metadata-Version: 2.4
Name: wikontic
Version: 0.0.3
Summary: Extract a knowledge graph from texts with an LLM and perform QA over the resulting KG
Author-email: Alla Chepurova <chepurova.data@gmail.com>
License-Expression: MIT
Project-URL: Homepage, https://github.com/screemix/Wikontic
Project-URL: Issues, https://github.com/screemix/Wikontic/issues
Classifier: Programming Language :: Python :: 3
Classifier: Operating System :: OS Independent
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: streamlit
Requires-Dist: numpy
Requires-Dist: pyvis
Requires-Dist: python-dotenv
Requires-Dist: pymongo
Requires-Dist: openai
Requires-Dist: tenacity
Requires-Dist: pathlib
Requires-Dist: typing
Requires-Dist: unidecode
Requires-Dist: torch>=2.4.0
Requires-Dist: transformers
Requires-Dist: dataclasses
Requires-Dist: pydantic
Requires-Dist: accelerate
Requires-Dist: langchain
Dynamic: license-file
```

![](https://github.com/screemix/Wikontic/raw/main/demo-screen.gif)

# Wikontic

**Build ontology-aware, Wikidata-aligned knowledge graphs from raw text using LLMs**

---

## 🚀 Overview

Knowledge Graphs (KGs) provide structured, verifiable representations of knowledge, enabling fact grounding and empowering large language models (LLMs) with up-to-date, real-world information. However, creating high-quality KGs from open-domain text is challenging due to issues like redundancy, inconsistency, and lack of alignment with formal ontologies.

**Wikontic** is a multi-stage pipeline for constructing ontology-aligned KGs from unstructured text using LLMs and Wikidata. It extracts candidate triples from raw text, then refines them through ontology-based typing, schema validation, and entity deduplication, resulting in compact, semantically coherent graphs.

---

## 📁 Repository Structure

- `preprocessing/constraint-preprocessing.ipynb`
  Jupyter notebook for collecting constraint rules from Wikidata.

- `utils/`
  Utilities for LLM-based triple extraction and alignment with Wikidata ontology rules.

- `utils/openai_utils.py`
  `LLMTripletExtractor` class for LLM-based triple extraction.

### With ontology:

- `utils/ontology_mappings/`
  JSON files containing ontology mappings from Wikidata.

- `utils/structured_inference_with_db.py`
  - `StructuredInferenceWithDB` class: triple extraction and QA functions

- `utils/structured_aligner.py`
  - `Aligner` class: ontology alignment and entity name refinement

### Without ontology:

- `utils/inference_with_db.py`
  - `InferenceWithDB` class: triple extraction and QA functions

- `utils/dynamic_aligner.py`
  - `Aligner` class: entity and relation name refinement

### Evaluation:

- `inference_and_eval`
  - Scripts for building KGs for the MuSiQue and HotPot datasets and evaluating QA performance
- `analysis`
  - Notebooks with downstream analysis of the resulting KGs

### Use Wikontic as a service:

- `pages/` and `Wikontic.py`
  Code for the web service for knowledge graph extraction and visualization.

- `Dockerfile`
  For building a containerized web service.

---

## 🏁 Getting Started

1. **Set up the ontology and KG databases:**
```
./setup_db.sh
```

2. **Launch the web service:**
```
streamlit run Wikontic.py
```

---

Enjoy building knowledge graphs with Wikontic!
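For the containerized route mentioned above, a typical build-and-run sequence might look like the following; the image name and port mapping are assumptions (8501 is Streamlit's default port), not commands taken from the repository:

```
docker build -t wikontic .
docker run -p 8501:8501 wikontic
```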